-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine_features.py
113 lines (98 loc) · 4.43 KB
/
combine_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
#
# combine_features.py
#
# This program is specifically designed to combine the output of
# complementary_environmental_variables.py (at various buffer sizes) with the
# survey data generated by the Percept project:
# https://github.com/Spatial-Data-Science-and-GEO-AI-Lab/percept
# More details can be found in the README.md file.
#
# Copyright (2024): Matthew Danish
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
import os
import sys
import bz2
import pandas as pd
import geopandas as gpd
from pathlib import Path
import requests
import json
import argparse
from functools import reduce
# Command-line interface. All options have defaults except --output.
parser = argparse.ArgumentParser(
    prog='combine_features',
    description='combine geojson/gpkg features')
# Where cached/saved inputs live (defaults to ./cev, resolved below).
parser.add_argument(
    '--basedir', metavar='DIR', default=None,
    help='Directory to read cached and saved files from')
# Which per-buffer-size feature_stats files to combine.
parser.add_argument(
    '--buffer-sizes', metavar='METERS,METERS,...', default='100,300',
    help='Comma-separated list of buffer size (radius in meters) to read the saved data about')
# Source of the survey points: explicit URL, or a file containing the URL.
parser.add_argument(
    '--geojson', metavar='URL', default=None,
    help='URL to download GeoJSON file with geographic points to analyze.')
parser.add_argument(
    '--geojson-url-in-file', metavar='FILE', default='url.txt',
    help='Get GeoJSON URL from this file if --geojson not specified.')
parser.add_argument(
    '--quiet', action='store_true', default=False,
    help='quiet mode')
parser.add_argument(
    '--output', '-o', metavar='FILE', required=True,
    help='Filename for output')
args = parser.parse_args()
def log(s, level=1, flush=False):
    """Print message *s* unless --quiet suppresses it.

    Messages at level 0 are always shown; higher levels are hidden in
    quiet mode. *flush* is passed through to print().
    """
    suppressed = args.quiet and level > 0
    if not suppressed:
        print(s, flush=flush)
# Resolve the base directory holding cached/saved files.
if args.basedir is None:
    basedir = Path(os.getcwd()) / "cev"
else:
    # Wrap in Path: the argparse value is a plain string, and later code
    # relies on the / operator (e.g. basedir / "dataset.gpkg"), which
    # raises TypeError for str / str.
    basedir = Path(args.basedir)
def feature_stats_path(buffer_size): return basedir / Path(f'feature_stats_buffer{buffer_size}m.gpkg')
# GeoJSON data URL: prefer --geojson; otherwise read the URL from a file.
if args.geojson is not None:
    url = args.geojson
elif args.geojson_url_in_file is not None:
    with open(args.geojson_url_in_file) as fp:
        url = fp.read().strip()
else:
    url = None
if not url:
    # Also catches an empty/blank URL file, which would otherwise fail
    # later with a confusing requests error.
    log('No GeoJSON url supplied', level=0)
    sys.exit(1)
buffer_sizes = list(map(int, args.buffer_sizes.split(',')))
# Fail fast if any required per-buffer-size stats file is missing.
for bufsize in buffer_sizes:
    if not feature_stats_path(bufsize).exists():
        # level=0: this is a fatal error, so it must be shown even in
        # --quiet mode (the original default level=1 hid it).
        log(f'Missing path: {feature_stats_path(bufsize)}', level=0)
        sys.exit(1)
# Cached copy of the downloaded survey points.
dataset_path = basedir / "dataset.gpkg"
def load_points(url, epsg=None):
    """Load the survey points as a GeoDataFrame.

    If a cached copy exists at dataset_path it is loaded from there;
    otherwise the GeoJSON is downloaded from *url*, tagged as WGS84
    (EPSG:4326, the GeoJSON standard CRS per RFC 7946), optionally
    reprojected, cached to dataset_path, and returned.

    Parameters:
        url: URL of a GeoJSON FeatureCollection to download.
        epsg: optional EPSG code of a projected CRS to convert to.
            The original code referenced undefined names (`pyproj`,
            `epsg`) here and would raise NameError; callers that need a
            projected CRS should now pass it explicitly.

    Returns: a GeoDataFrame of the points.
    Raises: requests.HTTPError if the download fails.
    """
    if dataset_path.exists():
        log(f'load_points: loading existing {dataset_path}')
        return gpd.read_file(dataset_path)
    # Download the GeoJSON data; surface HTTP failures instead of trying
    # to parse an error page as JSON.
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    # GeoJSON coordinates are WGS84 by specification.
    gdf = gpd.GeoDataFrame.from_features(data["features"], crs="EPSG:4326")
    if epsg is not None:
        # Convert to the requested projected CRS.
        gdf = gdf.to_crs(epsg=epsg)
    # Cache for subsequent runs.
    gdf.to_file(dataset_path, driver="GPKG")
    return gdf
# Read the feature stats at the given buffer size from the appropriate saved
# file within basedir, return a GeoDataFrame.
def load_feature_stats(bufsize):
    """Load the saved feature stats for *bufsize*, suffixing every data
    column name with the buffer size so columns from different buffer
    sizes stay distinct after merging. The join keys ('image_id' and
    'geometry') keep their original names."""
    keys = {'image_id', 'geometry'}
    gdf = gpd.read_file(feature_stats_path(bufsize))
    suffixed = {col: f'{col}_{bufsize}' for col in gdf.columns if col not in keys}
    gdf.rename(columns=suffixed, inplace=True)
    return gdf
# Merge two GeoDataFrames that are expected to have matching 'image_id' and
# 'geometry' columns, ensure that the output puts all of the columns from both
# dataframes into each matched row. Also includes any row that did not match in
# the other dataframe. The first parameter is a GeoDataFrame, the second
# parameter is a buffer size, which is used to read a specific file within
# basedir to load the 2nd GeoDataFrame.
def merge(gdf1, bufsize):
    """Outer-merge *gdf1* with the feature stats saved for *bufsize*.

    Rows are matched on ('image_id', 'geometry'); unmatched rows from
    either side are kept (how='outer'). Shaped for use with
    functools.reduce over a list of buffer sizes.
    """
    stats = load_feature_stats(bufsize)
    combined = gdf1.merge(stats, how='outer', on=['image_id', 'geometry'])
    return combined
# Successively apply merge until all data loaded from all the saved files
# (named by buffer size) are in one GeoDataFrame.
# Starts from the survey points, then folds in one feature-stats file per
# buffer size via the outer merge defined above.
gdf = reduce(merge,buffer_sizes,load_points(url))
log(f'Writing to file {args.output}')
# Output format is inferred by geopandas/fiona from the file extension.
gdf.to_file(args.output)