diff --git a/app/bgs_rules.py b/app/bgs_rules.py
index 276ec89..b93ac78 100644
--- a/app/bgs_rules.py
+++ b/app/bgs_rules.py
@@ -184,16 +184,11 @@ def check_loca_within_great_britain(tables: dict) -> List[dict]:
 
     # Read data into geodataframe
     try:
-        location = tables['LOCA'].set_index('LOCA_ID')
-        location['geometry'] = list(zip(location['LOCA_NATE'], location['LOCA_NATN']))
+        location = create_location_gpd(tables)
     except KeyError:
         # LOCA not present, already checked in earlier rule
         return errors
 
-    location['geometry'] = location['geometry'].apply(Point)
-    location = gpd.GeoDataFrame(location, geometry='geometry', crs='EPSG:27700')
-    location['line_no'] = range(1, len(location) + 1)
-
     inside_uk_eea_mask = location.intersects(uk_eea_outline)
     inside_gb_mask = location.intersects(gb_outline)
     as_irish_grid = location.to_crs("EPSG:29903")
@@ -225,6 +220,16 @@ def check_loca_within_great_britain(tables: dict) -> List[dict]:
     return errors
 
 
+def create_location_gpd(tables: dict[str, pd.DataFrame]) -> gpd.GeoDataFrame:
+    location: pd.DataFrame = tables['LOCA'].set_index('LOCA_ID')
+    location['geometry'] = list(zip(location['LOCA_NATE'], location['LOCA_NATN']))
+    location['geometry'] = location['geometry'].apply(Point)
+    location = gpd.GeoDataFrame(location, geometry='geometry', crs='EPSG:27700')
+    location['line_no'] = range(1, len(location) + 1)
+
+    return location
+
+
 def check_locx_is_not_duplicate_of_other_column(tables: dict) -> List[dict]:
     """LOCA_LOCX and LOCA_LOCY are not duplicates of other columns"""
 
diff --git a/app/borehole_map.py b/app/borehole_map.py
new file mode 100644
index 0000000..1e5d70c
--- /dev/null
+++ b/app/borehole_map.py
@@ -0,0 +1,89 @@
+"""
+Functions used to generate a map of borehole locations by extracting a GeoJSON
+representation of their metadata from the AGS files.
+"""
+from copy import copy
+import json
+from functools import reduce
+import logging
+from pathlib import Path
+
+import pandas as pd
+import geopandas as gpd
+
+from app.checkers import load_tables_reporting_errors
+from app.bgs_rules import create_location_gpd
+
+logger = logging.getLogger(__name__)
+
+
+def extract_geojson(filepath: Path) -> dict:
+    """
+    Read an AGS4 file and extract geojson representation of LOCA table and
+    metadata.
+    """
+    logger.info("Extracting geojson from %s", filepath.name)
+
+    # Read data file
+    tables, load_error, _ = load_tables_reporting_errors(filepath)
+    if load_error:
+        raise ValueError(load_error)
+
+    # Convert to geodataframe
+    try:
+        location: gpd.GeoDataFrame = create_location_gpd(tables)
+    except KeyError:
+        msg = f"ERROR: LOCA group missing from {filepath}"
+        raise ValueError(msg)
+
+    # Add project columns and drop unwanted columns
+    try:
+        project: pd.DataFrame = tables['PROJ']
+    except KeyError:
+        msg = f"ERROR: PROJ group missing from {filepath}"
+        raise ValueError(msg)
+
+    for column in project.columns:
+        if column.startswith('PROJ_'):
+            # We assume that each file contains just one project
+            location[column] = project.loc[0, column]
+
+    try:
+        location['PROJ_FILE_FSET'] = project.loc[0, 'FILE_FSET']
+        location.rename(columns={'FILE_FSET': 'LOCA_FILE_FSET'}, inplace=True)
+    except KeyError:
+        logger.debug("No FILE_FSET for either/both PROJ and LOCA groups for %s",
+                     filepath)
+    del location['HEADING']
+
+    # Create new ID from project and location IDs
+    location.reset_index(inplace=True)
+    location['ID'] = location['PROJ_ID'].str.cat(location['LOCA_ID'], sep='.')
+    location.set_index('ID', inplace=True)
+
+    # Reproject to WGS84
+    location = location.to_crs('EPSG:4326')
+
+    # Return dict representation of geojson
+    return json.loads(location.to_json())
+
+
+def concatenate_feature_collections(feature_collections: list[dict]) -> dict:
+    """
+    Concatenate feature collections, assuming collection metadata are all
+    the same.
+    """
+
+    def join_two(first_collection: dict, next_collection: dict) -> dict:
+        """
+        Join collections by extending the features list. Use copy because
+        lists and dictionaries are mutable and we don't want to change the
+        input values.
+        """
+        new_features: list[dict] = copy(first_collection['features'])
+        new_features.extend(next_collection['features'])
+        new_collection = first_collection.copy()
+        new_collection['features'] = new_features
+        return new_collection
+
+    return reduce(join_two, feature_collections)
diff --git a/app/checkers.py b/app/checkers.py
index 9eff809..ed858a5 100644
--- a/app/checkers.py
+++ b/app/checkers.py
@@ -61,26 +61,15 @@ def check_bgs(filename: Path, **kwargs) -> dict:
     """
     logger.info("Checking %s against BGS rules.", filename.name)
     errors = {}
-    error_message = None
+    load_error = None
     bgs_metadata = {}
 
-    try:
-        # Try to load and convert the file. Coordinate type errors replace
-        # empty dictionary from outer scope
-        tables, headers, errors = load_AGS4_as_numeric(filename)
-    except UnboundLocalError:
-        # This error is thrown in response to a bug in the upstream code,
-        # which in turn is only triggered if the AGS file has duplicate
-        # headers.
-        error_message = "ERROR: File contains duplicate headers"
-    except AGS4.AGS4Error as err:
-        error_message = str(err)
-    except IndexError:
-        error_message = "ERROR: File cannot be read, please use AGS checker to confirm format errors"
+    tables, load_error, ags4_errors = load_tables_reporting_errors(filename)
 
-    if error_message:
-        errors['File read error'] = [{'line': '-', 'group': '', 'desc': error_message}]
+    if load_error:
+        errors['File read error'] = [{'line': '-', 'group': '', 'desc': load_error}]
     else:
+        errors.update(ags4_errors)
         # Get additional metadata
         bgs_metadata = generate_bgs_metadata(tables)
@@ -96,6 +85,28 @@ def check_bgs(filename: Path, **kwargs) -> dict:
                 additional_metadata=bgs_metadata)
 
 
+def load_tables_reporting_errors(filename):
+    tables = None
+    ags4_errors = {}
+
+    try:
+        # Try to load and convert the file. Coordinate type errors replace
+        # empty dictionary from outer scope
+        tables, _, ags4_errors = load_ags4_as_numeric(filename)
+        load_error = None
+    except UnboundLocalError:
+        # This error is thrown in response to a bug in the upstream code,
+        # which in turn is only triggered if the AGS file has duplicate
+        # headers.
+        load_error = "ERROR: File contains duplicate headers"
+    except AGS4.AGS4Error as err:
+        load_error = str(err)
+    except IndexError:
+        load_error = "ERROR: File cannot be read, please use AGS checker to confirm format errors"
+
+    return tables, load_error, ags4_errors
+
+
 def generate_bgs_metadata(tables: Dict[str, pd.DataFrame]) -> dict:
     """Generate additional metadata from groups."""
     try:
@@ -119,7 +130,7 @@ def generate_bgs_metadata(tables: Dict[str, pd.DataFrame]) -> dict:
     return bgs_metadata
 
 
-def load_AGS4_as_numeric(filename: Path) -> Tuple[dict, dict, List[dict]]:
+def load_ags4_as_numeric(filename: Path) -> Tuple[dict, dict, List[dict]]:
     """Read AGS4 file and convert to numeric data types."""
     tables, headings = AGS4.AGS4_to_dataframe(filename)
 
diff --git a/requirements.in b/requirements.in
index 28d70c2..230af10 100644
--- a/requirements.in
+++ b/requirements.in
@@ -5,6 +5,7 @@ aiofiles
 colorlog
 geopandas
 numpy
+geojson-pydantic
 pyproj
 python-ags4==0.5.0
 requests
diff --git a/requirements.txt b/requirements.txt
index 2a2f35c..07569c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -41,6 +41,8 @@ fiona==1.9.5
     # via
     #   -r requirements.in
     #   geopandas
+geojson-pydantic==0.6.3
+    # via -r requirements.in
 geopandas==0.14.3
     # via -r requirements.in
 h11==0.14.0
@@ -76,6 +78,7 @@ pydantic==1.10.14
     # via
     #   -r requirements.in
     #   fastapi
+    #   geojson-pydantic
 pygments==2.17.2
     # via rich
 pyproj==3.6.1
diff --git a/test/integration/test_api.py b/test/integration/test_api.py
index 6111d50..8790411 100644
--- a/test/integration/test_api.py
+++ b/test/integration/test_api.py
@@ -14,7 +14,7 @@ from python_ags4 import AGS4
 
 from app.main import app
-from app.checkers import load_AGS4_as_numeric
+from app.checkers import load_ags4_as_numeric
 import app.routes as app_routes
 from test.fixtures import (BAD_FILE_DATA, DICTIONARIES, FROZEN_TIME,
                            GOOD_FILE_DATA)
@@ -545,7 +545,7 @@ def test_get_ags_export_single_id(client, tmp_path):
     unzipped_ags_file = tmp_path / 'test.ags'
     with open(unzipped_ags_file, 'wb') as f:
         f.write(ags_file.read())
-    tables, _, _ = load_AGS4_as_numeric(unzipped_ags_file)
+    tables, _, _ = load_ags4_as_numeric(unzipped_ags_file)
     assert tables['PROJ']['BGS_PROJ_ID'][0] == bgs_proj_id
     # Confirm the metadata file is correct
     with ags_zip.open(ags_metadata_file_name) as metadata_file:
diff --git a/test/unit/test_bgs_rules.py b/test/unit/test_bgs_rules.py
index 143d549..33d080c 100644
--- a/test/unit/test_bgs_rules.py
+++ b/test/unit/test_bgs_rules.py
@@ -4,7 +4,7 @@ import pytest
 
 from app.bgs_rules import BGS_RULES
-from app.checkers import load_AGS4_as_numeric
+from app.checkers import load_ags4_as_numeric
 from test.fixtures import BGS_RULES_ERRORS
 
 TEST_FILE_DIR = Path(__file__).parent.parent / 'files'
 
@@ -16,7 +16,7 @@ def test_required_groups():
     expected = {'line': '-', 'group': '',
                 'desc': 'Required groups not present: ABBR, TYPE, UNIT, (LOCA or HOLE)'}
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Required Groups'](tables)
 
@@ -29,7 +29,7 @@ def test_required_bgs_groups():
     expected = {'line': '-', 'group': '',
                 'desc': 'Required BGS groups not present: GEOL'}
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Required BGS Groups'](tables)
 
@@ -42,7 +42,7 @@ def test_spatial_referencing():
     expected = {'line': '-', 'group': 'LOCA',
                 'desc': 'Spatial referencing system not in LOCA_GREF, LOCA_LREF or LOCA_LLZ!'}
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Spatial Referencing'](tables)
 
@@ -60,7 +60,7 @@ def test_eastings_northings_present():
          'group': 'LOCA',
          'desc': 'LOCA_NATN contains zeros or null values'}
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Eastings/Northings Present'](tables)
 
@@ -78,7 +78,7 @@ def test_eastings_northings_range():
          'group': 'LOCA',
          'desc': 'LOCA_NATN values outside 100,000 to 1,400,000 range'},
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Eastings/Northings Range'](tables)
 
@@ -96,7 +96,7 @@ def test_drill_depth_present():
          'group': 'HDPH',
          'desc': 'HDPH_BASE contains zero or null values'},
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Drill Depth Present'](tables)
 
@@ -112,7 +112,7 @@ def test_drill_depth_geol_record():
         {'line': '-', 'group': 'HDPH',
          'desc': "GEOL LOCA_IDs not in HDPH group ({'BH109'})"},
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Drill Depth GEOL Record'](tables)
 
@@ -150,7 +150,7 @@ def test_loca_within_great_britain():
          'group': 'LOCA',
          'line': '6'}]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA within Great Britain'](tables)
 
@@ -168,7 +168,7 @@ def test_loca_locx_is_not_duplicate_of_other_column():
          'group': 'LOCA',
          'line': '-'},
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA_LOCX is not duplicate of other column'](tables)
 
@@ -186,7 +186,7 @@ def test_loca_references_are_valid():
          'group': 'SAMP',
          'line': '-'},
     ]
 
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: LOCA_ID references'](tables)
 
@@ -202,7 +202,7 @@ def test_non_numeric_coord_types():
                 "line": "-"}
         ]}
 
-    _, _, errors = load_AGS4_as_numeric(filename)
+    _, _, errors = load_ags4_as_numeric(filename)
 
     assert errors == expected
 
@@ -216,7 +216,7 @@
 def test_sample_referential_integrity(filename, expected):
     # Arrange
     filename = TEST_FILE_DIR / 'bgs_rules' / filename
-    tables, _, _ = load_AGS4_as_numeric(filename)
+    tables, _, _ = load_ags4_as_numeric(filename)
 
     errors = BGS_RULES['BGS data validation: Sample Referencing'](tables)
 
diff --git a/test/unit/test_borehole_map.py b/test/unit/test_borehole_map.py
new file mode 100644
index 0000000..d93ff84
--- /dev/null
+++ b/test/unit/test_borehole_map.py
@@ -0,0 +1,93 @@
+"""
+Tests for borehole_map.py
+"""
+from pathlib import Path
+
+from geojson_pydantic import FeatureCollection
+import pytest
+
+from app.borehole_map import extract_geojson, concatenate_feature_collections
+
+TEST_FILE_DIR = Path(__file__).parent.parent / 'files'
+
+
+def test_extract_geojson_example_ags():
+    # Arrange
+    ags_filepath = TEST_FILE_DIR / 'example_ags.ags'
+
+    # Act
+    result = extract_geojson(ags_filepath)
+
+    # Assert
+    # Creation of FeatureCollection ensures correct fields exist
+    feature_collection = FeatureCollection(**result)
+    assert isinstance(feature_collection, FeatureCollection)
+    assert len(feature_collection) == 1
+
+    feature = feature_collection[0]
+    assert feature.properties['PROJ_ID'] == '121415'
+    assert feature.properties['LOCA_ID'] == '327-16A'
+    assert 'LOCA_FILE_FSET' in feature.properties
+    assert 'PROJ_FILE_FSET' in feature.properties
+    assert feature.id == '121415.327-16A'
+    lon, lat = feature.geometry.coordinates
+    assert -180 <= lon <= 180
+    assert -90 <= lat <= 90
+
+
+@pytest.mark.parametrize('ags_filepath, expected_error', [
+    (TEST_FILE_DIR / 'real' / 'AGS3' / 'Cowlairs park.ags',
+     'ERROR: File contains duplicate headers'),
+    (TEST_FILE_DIR / 'real' / 'AGS3' / 'A4106.ags',
+     'ERROR: LOCA group missing from '),
+    (TEST_FILE_DIR / 'real' / 'A487 Pont ar Dyfi Improvement.ags',
+     'Line 106 does not have the same number of entries as the HEADING row in GEOL.'),
+    (TEST_FILE_DIR / 'real' / 'AGS3' / 'PE131061.ags',
+     'ERROR: File cannot be read, please use AGS checker to confirm format errors'),
+])
+def test_extract_geojson_bad_files(ags_filepath, expected_error):
+    # Act and assert
+    with pytest.raises(ValueError, match=expected_error):
+        extract_geojson(ags_filepath)
+
+
+def test_concatenate_feature_collections():
+    # Arrange
+    ashfield = extract_geojson(TEST_FILE_DIR / 'real' /
+                               'Ashfield Area C Development.ags')
+    severn = extract_geojson(TEST_FILE_DIR / 'real' /
+                             'Mount Severn- Environment Agency.ags')
+    wells_relief = extract_geojson(TEST_FILE_DIR / 'real' /
+                                   'wells relief bh.ags')
+    site_list = [ashfield, severn, wells_relief]
+    total_features = sum(len(collection['features'])
+                         for collection in site_list)
+
+    # Act
+    result = concatenate_feature_collections(site_list)
+
+    # Assert
+    # Creation of FeatureCollection ensures correct fields exist
+    feature_collection = FeatureCollection(**result)
+    assert isinstance(feature_collection, FeatureCollection)
+
+    assert len(feature_collection) == total_features
+
+
+"""
+# This commented-out test can be used to attempt to parse all files and
+# see the range of potential exceptions.
+ +@pytest.mark.skip(reason="Only used to find range of potential exceptions") +@pytest.mark.parametrize('ags_filepath', + list((TEST_FILE_DIR / 'real').glob('*.ags')) + ) +def test_extract_geojson_real_files(ags_filepath): + # Act + result = extract_geojson(ags_filepath) + + # Assert + # Creation of FeatureCollection ensures correct fields exist + feature_collection = FeatureCollection(**result) + assert isinstance(feature_collection, FeatureCollection) +"""
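
Usage sketch for the two new borehole_map functions in this patch (a minimal sketch; the data/ input directory and the borehole_map.json output filename are hypothetical and not part of this change):

import json
from pathlib import Path

from app.borehole_map import extract_geojson, concatenate_feature_collections

# Hypothetical directory of AGS4 files to map
ags_dir = Path('data')

feature_collections = []
for ags_file in sorted(ags_dir.glob('*.ags')):
    try:
        feature_collections.append(extract_geojson(ags_file))
    except ValueError as error:
        # extract_geojson raises ValueError for unreadable files or files
        # missing the LOCA or PROJ groups; skip them and keep going
        print(f"Skipping {ags_file.name}: {error}")

if feature_collections:
    borehole_map = concatenate_feature_collections(feature_collections)
    Path('borehole_map.json').write_text(json.dumps(borehole_map))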