Merge pull request #137 from BritishGeologicalSurvey/32-geojson-extract
32 geojson extract
ximenesuk authored Mar 7, 2024
2 parents 5722cff + 8f84a7d commit 4ba8e73
Showing 8 changed files with 240 additions and 38 deletions.
17 changes: 11 additions & 6 deletions app/bgs_rules.py
@@ -184,16 +184,11 @@ def check_loca_within_great_britain(tables: dict) -> List[dict]:
 
     # Read data into geodataframe
     try:
-        location = tables['LOCA'].set_index('LOCA_ID')
-        location['geometry'] = list(zip(location['LOCA_NATE'], location['LOCA_NATN']))
+        location = create_location_gpd(tables)
     except KeyError:
         # LOCA not present, already checked in earlier rule
         return errors
 
-    location['geometry'] = location['geometry'].apply(Point)
-    location = gpd.GeoDataFrame(location, geometry='geometry', crs='EPSG:27700')
-    location['line_no'] = range(1, len(location) + 1)
-
     inside_uk_eea_mask = location.intersects(uk_eea_outline)
     inside_gb_mask = location.intersects(gb_outline)
     as_irish_grid = location.to_crs("EPSG:29903")
@@ -225,6 +220,16 @@ def check_loca_within_great_britain(tables: dict) -> List[dict]:
     return errors
 
 
+def create_location_gpd(tables: dict[pd.DataFrame]) -> gpd.GeoDataFrame:
+    location: pd.DataFrame = tables['LOCA'].set_index('LOCA_ID')
+    location['geometry'] = list(zip(location['LOCA_NATE'], location['LOCA_NATN']))
+    location['geometry'] = location['geometry'].apply(Point)
+    location = gpd.GeoDataFrame(location, geometry='geometry', crs='EPSG:27700')
+    location['line_no'] = range(1, len(location) + 1)
+
+    return location
+
+
 def check_locx_is_not_duplicate_of_other_column(tables: dict) -> List[dict]:
     """LOCA_LOCX and LOCA_LOCY are not duplicates of other columns"""
 
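For orientation, a minimal sketch of how the extracted helper behaves; the LOCA table below is invented for illustration and carries only the columns the function touches:

# Minimal usage sketch for create_location_gpd (illustrative input only).
import pandas as pd

from app.bgs_rules import create_location_gpd

tables = {
    'LOCA': pd.DataFrame({
        'LOCA_ID': ['BH101', 'BH102'],      # becomes the index
        'LOCA_NATE': [400000.0, 401000.0],  # eastings, EPSG:27700
        'LOCA_NATN': [300000.0, 301000.0],  # northings, EPSG:27700
    }),
}

location = create_location_gpd(tables)
print(location.crs)               # British National Grid (EPSG:27700)
print(list(location['line_no']))  # [1, 2]: 1-based rows carried into rule error reports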
89 changes: 89 additions & 0 deletions app/borehole_map.py
@@ -0,0 +1,89 @@
"""
Functions used to generate a map of borehole locations by extracting a GeoJSON
representation of their metadata from the AGS files.
"""
from copy import copy
import json
from functools import reduce
import logging
from pathlib import Path

import pandas as pd
import geopandas as gpd

from app.checkers import load_tables_reporting_errors
from app.bgs_rules import create_location_gpd

logger = logging.getLogger(__name__)


def extract_geojson(filepath: Path) -> dict:
"""
Read an AGS4 file and extract geojson represenation of LOCA table and
metadata.
"""
logger.info("Extracting geojson from %s", filepath.name)

# Read data file
tables, load_error, _ = load_tables_reporting_errors(filepath)
if load_error:
raise ValueError(load_error)

# Convert to geodataframe
try:
location: gpd.GeoDataFrame = create_location_gpd(tables)
except KeyError:
msg = f"ERROR: LOCA group missing from {filepath}"
raise ValueError(msg)

# Add project columns and drop unwanted columns
try:
project: pd.DataFrame = tables['PROJ']
except KeyError:
msg = f"ERROR: PROJ group missing from {filepath}"
raise ValueError(msg)

for column in project.columns:
if column.startswith('PROJ_'):
# We assume that each file contains just one project
location[column] = project.loc[0, column]

try:
location['PROJ_FILE_FSET'] = project.loc[0, 'FILE_FSET']
location.rename(columns={'FILE_FSET': 'LOCA_FILE_FSET'}, inplace=True)
except KeyError:
logger.debug("No FILE_FSET for either/both PROJ and LOCA groups for %s",
filepath)
del location['HEADING']

# Create new ID from project and location IDs
location.reset_index(inplace=True)
location['ID'] = location['PROJ_ID'].str.cat(location['LOCA_ID'], sep='.')
location.set_index('ID', inplace=True)

# Reproject to WGS84
location = location.to_crs('EPSG:4326')

# Return dict representation of geojson
return json.loads(location.to_json())


def concantenate_feature_collections(feature_collections: list[dict]) -> dict:
"""
Concatenate feature collections, assuming collection metadata are all
the same.
"""

def join_two(first_collection: dict, next_collection: dict) -> dict:
"""
Join collections by extending the features list. Use copy because
lists and dictionaries are mutable and we don't want to change the
input values.
"""
new_features: list[dict] = copy(first_collection['features'])
new_features.extend(next_collection['features'])
new_collection = first_collection.copy()
new_collection['features'] = new_features
return new_collection

return reduce(join_two, feature_collections)
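Downstream, the two functions compose into a single collection for mapping. A minimal sketch of the intended call pattern, assuming a directory of AGS files (the data/ags path is invented):

# Minimal sketch: one feature collection from many AGS files.
from pathlib import Path

from app.borehole_map import extract_geojson, concantenate_feature_collections

collections = []
for ags_file in sorted(Path('data/ags').glob('*.ags')):
    try:
        collections.append(extract_geojson(ags_file))
    except ValueError as error:
        # Unreadable files, or files missing LOCA/PROJ groups, are skipped
        print(f"Skipping {ags_file.name}: {error}")

if collections:  # reduce() would fail on an empty list
    feature_collection = concantenate_feature_collections(collections)
    print(len(feature_collection['features']), 'boreholes')

Because join_two copies each features list before extending it, the per-file collections remain usable after the reduce.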
45 changes: 28 additions & 17 deletions app/checkers.py
@@ -61,26 +61,15 @@ def check_bgs(filename: Path, **kwargs) -> dict:
     """
     logger.info("Checking %s against BGS rules.", filename.name)
     errors = {}
-    error_message = None
+    load_error = None
     bgs_metadata = {}
 
-    try:
-        # Try to load and convert the file. Coordinate type errors replace
-        # empty dictionary from outer scope
-        tables, headers, errors = load_AGS4_as_numeric(filename)
-    except UnboundLocalError:
-        # This error is thrown in response to a bug in the upstream code,
-        # which in turn is only triggered if the AGS file has duplicate
-        # headers.
-        error_message = "ERROR: File contains duplicate headers"
-    except AGS4.AGS4Error as err:
-        error_message = str(err)
-    except IndexError:
-        error_message = "ERROR: File cannot be read, please use AGS checker to confirm format errors"
+    tables, load_error, ags4_errors = load_tables_reporting_errors(filename)
 
-    if error_message:
-        errors['File read error'] = [{'line': '-', 'group': '', 'desc': error_message}]
+    if load_error:
+        errors['File read error'] = [{'line': '-', 'group': '', 'desc': load_error}]
     else:
+        errors.update(ags4_errors)
         # Get additional metadata
         bgs_metadata = generate_bgs_metadata(tables)
 
@@ -96,6 +85,28 @@ def check_bgs(filename: Path, **kwargs) -> dict:
                            additional_metadata=bgs_metadata)
 
 
+def load_tables_reporting_errors(filename):
+    tables = None
+    ags4_errors = {}
+
+    try:
+        # Try to load and convert the file. Coordinate type errors replace
+        # empty dictionary from outer scope
+        tables, _, ags4_errors = load_ags4_as_numeric(filename)
+        load_error = None
+    except UnboundLocalError:
+        # This error is thrown in response to a bug in the upstream code,
+        # which in turn is only triggered if the AGS file has duplicate
+        # headers.
+        load_error = "ERROR: File contains duplicate headers"
+    except AGS4.AGS4Error as err:
+        load_error = str(err)
+    except IndexError:
+        load_error = "ERROR: File cannot be read, please use AGS checker to confirm format errors"
+
+    return tables, load_error, ags4_errors
+
+
 def generate_bgs_metadata(tables: Dict[str, pd.DataFrame]) -> dict:
     """Generate additional metadata from groups."""
     try:
@@ -119,7 +130,7 @@ def generate_bgs_metadata(tables: Dict[str, pd.DataFrame]) -> dict:
     return bgs_metadata
 
 
-def load_AGS4_as_numeric(filename: Path) -> Tuple[dict, dict, List[dict]]:
+def load_ags4_as_numeric(filename: Path) -> Tuple[dict, dict, List[dict]]:
     """Read AGS4 file and convert to numeric data types."""
     tables, headings = AGS4.AGS4_to_dataframe(filename)
 
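The extracted loader replaces scattered try/except blocks with a single (tables, load_error, ags4_errors) contract. A minimal sketch of a caller, mirroring how check_bgs and extract_geojson use it above (the file path is invented):

# Minimal sketch of the load_tables_reporting_errors contract.
from pathlib import Path

from app.checkers import load_tables_reporting_errors

tables, load_error, ags4_errors = load_tables_reporting_errors(Path('example.ags'))

if load_error:
    # Unparseable file: tables is None and load_error holds the message
    print('File read error:', load_error)
else:
    # Parsed file: tables holds DataFrames; ags4_errors holds any
    # coordinate type errors found during numeric conversion
    print('Groups loaded:', sorted(tables))
    print('Conversion errors:', ags4_errors)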
1 change: 1 addition & 0 deletions requirements.in
@@ -5,6 +5,7 @@ aiofiles
 colorlog
 geopandas
 numpy
+geojson-pydantic
 pyproj
 python-ags4==0.5.0
 requests
3 changes: 3 additions & 0 deletions requirements.txt
@@ -41,6 +41,8 @@ fiona==1.9.5
     # via
     #   -r requirements.in
     #   geopandas
+geojson-pydantic==0.6.3
+    # via -r requirements.in
 geopandas==0.14.3
     # via -r requirements.in
 h11==0.14.0
@@ -76,6 +78,7 @@ pydantic==1.10.14
     # via
     #   -r requirements.in
     #   fastapi
+    #   geojson-pydantic
 pygments==2.17.2
     # via rich
 pyproj==3.6.1
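The hunks loaded here never import geojson-pydantic (the remaining changed file did not load), so its role is an assumption: most plausibly it types the GeoJSON response model. A sketch of validating extract_geojson output on that assumption:

# Hedged sketch: validating the extract with geojson-pydantic.
# Assumption only -- the diff shown here does not exercise the package.
from pathlib import Path

from geojson_pydantic import FeatureCollection

from app.borehole_map import extract_geojson

geojson = extract_geojson(Path('example.ags'))     # invented path
collection = FeatureCollection.parse_obj(geojson)  # pydantic 1.x API
print(collection.type, len(collection.features))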
4 changes: 2 additions & 2 deletions test/integration/test_api.py
@@ -14,7 +14,7 @@
 from python_ags4 import AGS4
 
 from app.main import app
-from app.checkers import load_AGS4_as_numeric
+from app.checkers import load_ags4_as_numeric
 import app.routes as app_routes
 from test.fixtures import (BAD_FILE_DATA, DICTIONARIES, FROZEN_TIME,
                            GOOD_FILE_DATA)
@@ -545,7 +545,7 @@ def test_get_ags_export_single_id(client, tmp_path):
     unzipped_ags_file = tmp_path / 'test.ags'
     with open(unzipped_ags_file, 'wb') as f:
         f.write(ags_file.read())
-    tables, _, _ = load_AGS4_as_numeric(unzipped_ags_file)
+    tables, _, _ = load_ags4_as_numeric(unzipped_ags_file)
     assert tables['PROJ']['BGS_PROJ_ID'][0] == bgs_proj_id
     # Confirm the metadata file is correct
     with ags_zip.open(ags_metadata_file_name) as metadata_file:
26 changes: 13 additions & 13 deletions test/unit/test_bgs_rules.py
@@ -4,7 +4,7 @@
 import pytest
 
 from app.bgs_rules import BGS_RULES
-from app.checkers import load_AGS4_as_numeric
+from app.checkers import load_ags4_as_numeric
 from test.fixtures import BGS_RULES_ERRORS
 
 TEST_FILE_DIR = Path(__file__).parent.parent / 'files'
@@ -16,7 +16,7 @@ def test_required_groups():
     expected = {'line': '-',
                 'group': '',
                 'desc': 'Required groups not present: ABBR, TYPE, UNIT, (LOCA or HOLE)'}
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Required Groups'](tables)
 
@@ -29,7 +29,7 @@ def test_required_bgs_groups():
     expected = {'line': '-',
                 'group': '',
                 'desc': 'Required BGS groups not present: GEOL'}
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Required BGS Groups'](tables)
 
@@ -42,7 +42,7 @@ def test_spatial_referencing():
     expected = {'line': '-',
                 'group': 'LOCA',
                 'desc': 'Spatial referencing system not in LOCA_GREF, LOCA_LREF or LOCA_LLZ!'}
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Spatial Referencing'](tables)
 
@@ -60,7 +60,7 @@ def test_eastings_northings_present():
          'group': 'LOCA',
          'desc': 'LOCA_NATN contains zeros or null values'}
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Eastings/Northings Present'](tables)
 
@@ -78,7 +78,7 @@ def test_eastings_northings_range():
          'group': 'LOCA',
          'desc': 'LOCA_NATN values outside 100,000 to 1,400,000 range'},
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Eastings/Northings Range'](tables)
 
@@ -96,7 +96,7 @@ def test_drill_depth_present():
          'group': 'HDPH',
          'desc': 'HDPH_BASE contains zero or null values'},
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Drill Depth Present'](tables)
 
@@ -112,7 +112,7 @@ def test_drill_depth_geol_record():
         {'line': '-', 'group': 'HDPH',
          'desc': "GEOL LOCA_IDs not in HDPH group ({'BH109'})"},
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Drill Depth GEOL Record'](tables)
 
@@ -150,7 +150,7 @@ def test_loca_within_great_britain():
                  'group': 'LOCA',
                  'line': '6'}]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA within Great Britain'](tables)
 
@@ -168,7 +168,7 @@ def test_loca_locx_is_not_duplicate_of_other_column():
          'group': 'LOCA',
          'line': '-'},
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA_LOCX is not duplicate of other column'](tables)
 
@@ -186,7 +186,7 @@ def test_loca_references_are_valid():
          'group': 'SAMP',
          'line': '-'},
     ]
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA_ID references'](tables)
 
@@ -202,7 +202,7 @@ def test_non_numeric_coord_types():
                 "line": "-"}
         ]}
 
-    _, _, errors = load_AGS4_as_numeric(filename)
+    _, _, errors = load_ags4_as_numeric(filename)
 
     assert errors == expected
 
@@ -216,7 +216,7 @@ def test_non_numeric_coord_types():
 def test_sample_referential_integrity(filename, expected):
     # Arrange
     filename = TEST_FILE_DIR / 'bgs_rules' / filename
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Sample Referencing'](tables)
 
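The test changes above are purely mechanical renames; a test for the newly extracted helper is not part of this commit, but a hypothetical sketch might look like the following (the fixture file name and assertions are invented):

# Hypothetical sketch, not in this commit: unit test for create_location_gpd.
from pathlib import Path

from app.bgs_rules import create_location_gpd
from app.checkers import load_ags4_as_numeric

TEST_FILE_DIR = Path(__file__).parent.parent / 'files'


def test_create_location_gpd():
    filename = TEST_FILE_DIR / 'example_ags.ags'  # invented fixture name
    tables, _, _ = load_ags4_as_numeric(filename)

    location = create_location_gpd(tables)

    assert location.crs == 'EPSG:27700'
    assert list(location['line_no']) == list(range(1, len(location) + 1))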