-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine_features.py
113 lines (98 loc) · 4.43 KB
/
combine_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
#
# combine_features.py
#
# This program is specifically designed to combine the output of
# complementary_environmental_variables.py (at various buffer sizes) with the
# survey data generated by the Percept project:
# https://github.com/Spatial-Data-Science-and-GEO-AI-Lab/percept
# More details can be found in the README.md file.
#
# Copyright (2024): Matthew Danish
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
import os
import sys
import bz2
import pandas as pd
import geopandas as gpd
from pathlib import Path
import requests
import json
import argparse
from functools import reduce
# Command-line interface. All options have defaults except --output.
parser = argparse.ArgumentParser(
    prog='combine_features',
    description='combine geojson/gpkg features')
# Where cached/saved inputs live (defaults to ./cev, resolved below).
parser.add_argument(
    '--basedir', metavar='DIR', default=None,
    help='Directory to read cached and saved files from')
# Which per-buffer-size feature_stats files to combine.
parser.add_argument(
    '--buffer-sizes', metavar='METERS,METERS,...', default='100,300',
    help='Comma-separated list of buffer size (radius in meters) to read the saved data about')
# Source of the survey points: explicit URL, or a file containing the URL.
parser.add_argument(
    '--geojson', metavar='URL', default=None,
    help='URL to download GeoJSON file with geographic points to analyze.')
parser.add_argument(
    '--geojson-url-in-file', metavar='FILE', default='url.txt',
    help='Get GeoJSON URL from this file if --geojson not specified.')
parser.add_argument(
    '--quiet', action='store_true', default=False,
    help='quiet mode')
parser.add_argument(
    '--output', '-o', metavar='FILE', required=True,
    help='Filename for output')
args = parser.parse_args()
def log(s, level=1, flush=False):
    """Print message *s* unless --quiet suppresses it.

    Messages at level 0 are always shown; higher levels are hidden in
    quiet mode. *flush* is passed through to print().
    """
    suppressed = args.quiet and level > 0
    if not suppressed:
        print(s, flush=flush)
# Resolve the base directory holding cached/saved files.
if args.basedir is None:
    basedir = Path(os.getcwd()) / "cev"
else:
    # Wrap in Path: the argparse value is a plain string, and later code
    # relies on the / operator (e.g. basedir / "dataset.gpkg"), which
    # raises TypeError for str / str.
    basedir = Path(args.basedir)
def feature_stats_path(buffer_size): return basedir / Path(f'feature_stats_buffer{buffer_size}m.gpkg')
# GeoJSON data URL: prefer --geojson; otherwise read the URL from a file.
if args.geojson is not None:
    url = args.geojson
elif args.geojson_url_in_file is not None:
    with open(args.geojson_url_in_file) as fp:
        url = fp.read().strip()
else:
    url = None
if not url:
    # Also catches an empty/blank URL file, which would otherwise fail
    # later with a confusing requests error.
    log('No GeoJSON url supplied', level=0)
    sys.exit(1)
buffer_sizes = list(map(int, args.buffer_sizes.split(',')))
# Fail fast if any required per-buffer-size stats file is missing.
for bufsize in buffer_sizes:
    if not feature_stats_path(bufsize).exists():
        # level=0: this is a fatal error, so it must be shown even in
        # --quiet mode (the original default level=1 hid it).
        log(f'Missing path: {feature_stats_path(bufsize)}', level=0)
        sys.exit(1)
# Cached copy of the downloaded survey points.
dataset_path = basedir / "dataset.gpkg"
def load_points(url, epsg=None):
    """Load the survey points as a GeoDataFrame.

    If a cached copy exists at dataset_path it is loaded from there;
    otherwise the GeoJSON is downloaded from *url*, tagged as WGS84
    (EPSG:4326, the GeoJSON standard CRS per RFC 7946), optionally
    reprojected, cached to dataset_path, and returned.

    Parameters:
        url: URL of a GeoJSON FeatureCollection to download.
        epsg: optional EPSG code of a projected CRS to convert to.
            The original code referenced undefined names (`pyproj`,
            `epsg`) here and would raise NameError; callers that need a
            projected CRS should now pass it explicitly.

    Returns: a GeoDataFrame of the points.
    Raises: requests.HTTPError if the download fails.
    """
    if dataset_path.exists():
        log(f'load_points: loading existing {dataset_path}')
        return gpd.read_file(dataset_path)
    # Download the GeoJSON data; surface HTTP failures instead of trying
    # to parse an error page as JSON.
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    # GeoJSON coordinates are WGS84 by specification.
    gdf = gpd.GeoDataFrame.from_features(data["features"], crs="EPSG:4326")
    if epsg is not None:
        # Convert to the requested projected CRS.
        gdf = gdf.to_crs(epsg=epsg)
    # Cache for subsequent runs.
    gdf.to_file(dataset_path, driver="GPKG")
    return gdf
# Read the feature stats at the given buffer size from the appropriate saved
# file within basedir, return a GeoDataFrame.
def load_feature_stats(bufsize):
    """Load the saved feature stats for *bufsize*, suffixing every data
    column name with the buffer size so columns from different buffer
    sizes stay distinct after merging. The join keys ('image_id' and
    'geometry') keep their original names."""
    keys = {'image_id', 'geometry'}
    gdf = gpd.read_file(feature_stats_path(bufsize))
    suffixed = {col: f'{col}_{bufsize}' for col in gdf.columns if col not in keys}
    gdf.rename(columns=suffixed, inplace=True)
    return gdf
# Merge two GeoDataFrames that are expected to have matching 'image_id' and
# 'geometry' columns, ensure that the output puts all of the columns from both
# dataframes into each matched row. Also includes any row that did not match in
# the other dataframe. The first parameter is a GeoDataFrame, the second
# parameter is a buffer size, which is used to read a specific file within
# basedir to load the 2nd GeoDataFrame.
def merge(gdf1, bufsize):
    """Outer-merge *gdf1* with the feature stats saved for *bufsize*.

    Rows are matched on ('image_id', 'geometry'); unmatched rows from
    either side are kept (how='outer'). Shaped for use with
    functools.reduce over a list of buffer sizes.
    """
    stats = load_feature_stats(bufsize)
    combined = gdf1.merge(stats, how='outer', on=['image_id', 'geometry'])
    return combined
# Successively apply merge until all data loaded from all the saved files
# (named by buffer size) are in one GeoDataFrame.
# Starts from the survey points, then folds in one feature-stats file per
# buffer size via the outer merge defined above.
gdf = reduce(merge,buffer_sizes,load_points(url))
log(f'Writing to file {args.output}')
# Output format is inferred by geopandas/fiona from the file extension.
gdf.to_file(args.output)