#!/usr/bin/env python
# coding: utf-8
# This code conducts vector subpixel shoreline extraction for DEA
# Coastlines:
#
# * Apply morphological extraction algorithms to mask annual median
# composite rasters to a valid coastal region
# * Extract waterline vectors using subpixel waterline extraction
# (Bishop-Taylor et al. 2019b; https://doi.org/10.3390/rs11242984)
# * Compute rates of coastal change at every 30 m of coastline
# using linear regression
import glob
import os
import sys
import warnings
import click
import pyproj
import odc.algo
import odc.geo.xr
import numpy as np
import pandas as pd
import xarray as xr
import geohash as gh
import geopandas as gpd
from affine import Affine
from rasterio.features import sieve
from scipy.stats import circstd, circmean, linregress
from shapely.geometry import box
from shapely.ops import nearest_points
from skimage.measure import label, regionprops
from skimage.morphology import (
black_tophat,
binary_closing,
binary_dilation,
binary_erosion,
dilation,
disk,
)
import datacube
from datacube.utils.aws import configure_s3_access
from coastlines.utils import configure_logging, load_config
from dea_tools.spatial import subpixel_contours, xr_vectorize, xr_rasterize
# Hide specific warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None
def load_rasters(
path,
raster_version,
study_area,
water_index="mndwi",
start_year=1988,
end_year=2021,
):
"""
Loads DEA Coastlines water index (e.g. 'MNDWI'), 'count',
and 'stdev' rasters for both annual and three-year gapfill data
into a consistent `xarray.Dataset` format for further analysis.
Parameters:
-----------
path : string
A string giving the directory containing raster outputs.
raster_version : string
A string giving the unique analysis name (e.g. 'v0.3.0') used
to load raster files.
study_area : string or int
A string giving the study area grid cell used to name raster files
(e.g. tile `6931`).
water_index : string, optional
A string giving the name of the water index to load. Defaults
to 'mndwi', which will load raster files produced using the
Modified Normalised Difference Water Index.
start_year : integer, optional
The first annual layer to include in the analysis. Defaults to
1988.
end_year : integer, optional
The final annual layer to include in the analysis. Defaults to
2021.
Returns:
--------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual input rasters.
The dataset contains water index (e.g. 'MNDWI'),
        'count', and 'stdev' arrays for each year from `start_year`
        to `end_year`.
gapfill_ds : xarray.Dataset
An `xarray.Dataset` containing three-year gapfill rasters.
The dataset contains water index (e.g. 'MNDWI'),
'count', and 'stdev' arrays for each year from
`start_year` to `end_year`.
"""
# List to hold output Datasets
ds_list = []
for layer_type in [".tif", "_gapfill.tif"]:
# List to hold output DataArrays
da_list = []
for layer_name in [f"{water_index}", "count", "stdev"]:
# Get paths of files that match pattern
paths = glob.glob(
f"{path}/{raster_version}/"
f"{study_area}_{raster_version}/"
f"*_{layer_name}{layer_type}"
)
# Test if data was returned
if len(paths) == 0:
raise Exception(
f"No rasters found for grid cell {study_area} "
f"(raster version '{raster_version}'). Verify that "
f"`raster.py` has been run "
"for this grid cell."
)
# Create variable used for time axis
time_var = xr.Variable("year", [int(i.split("/")[-1][0:4]) for i in paths])
# Import data
layer_da = xr.concat(
[xr.open_dataset(i, engine="rasterio").band_data for i in paths],
dim=time_var,
)
layer_da.name = f"{layer_name}"
# Append to file
da_list.append(layer_da)
# Combine into a single dataset and restrict to start and end year
layer_ds = xr.merge(da_list).squeeze("band", drop=True)
        layer_ds = layer_ds.sel(year=slice(start_year, end_year))
# Append to list
ds_list.append(layer_ds)
return ds_list
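# Example usage (illustrative sketch only; the path and raster version
# below are hypothetical placeholders, and assume `raster.py` outputs
# already exist for tile 6931):
#
#     yearly_ds, gapfill_ds = load_rasters(
#         path="data/interim/raster",
#         raster_version="v0.3.0",
#         study_area=6931,
#         water_index="mndwi",
#         start_year=1988,
#         end_year=2021,
#     )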
def ocean_masking(ds, ocean_da, connectivity=1, dilation=None):
"""
Identifies ocean by selecting regions of water that overlap
with ocean pixels. This region can be optionally dilated to
ensure that the sub-pixel algorithm has pixels on either side
of the water index threshold.
Parameters:
-----------
ds : xarray.DataArray
An array containing True for land pixels, and False for water.
This can be obtained by thresholding a water index
array (e.g. MNDWI < 0).
ocean_da : xarray.DataArray
A supplementary static dataset used to separate ocean waters
from other inland water. The array should contain values of 1
for high certainty ocean pixels, and 0 for all other pixels
(land, inland water etc). For Australia, we use the Geodata
100K coastline dataset, rasterized as the "geodata_coast_100k"
product on the DEA datacube.
connectivity : integer, optional
An integer passed to the 'connectivity' parameter of the
`skimage.measure.label` function.
dilation : integer, optional
        The number of pixels to dilate ocean pixels to ensure that
adequate land pixels are included for subpixel waterline
extraction. Defaults to None.
Returns:
--------
ocean_mask : xarray.DataArray
        An array containing a mask with identified ocean
        pixels set to True.
"""
# Update `ocean_da` to mask out any pixels that are land in `ds` too
ocean_da = ocean_da & (ds != 1)
    # First, break the all-time array into unique, discrete regions/blobs.
# Fill NaN with 1 so it is treated as a background pixel
blobs = xr.apply_ufunc(label, ds.fillna(1), 1, False, connectivity)
# For each unique region/blob, use region properties to determine
    # whether it overlaps with an ocean feature from `ocean_da`. If
# it does, then it is considered to be directly connected with the
# ocean; if not, then it is an inland waterbody.
ocean_mask = blobs.isin(
[i.label for i in regionprops(blobs.values, ocean_da.values) if i.max_intensity]
)
# Dilate mask so that we include land pixels on the inland side
# of each shoreline to ensure contour extraction accurately
    # separates land and water spectra
if dilation:
ocean_mask = xr.apply_ufunc(binary_dilation, ocean_mask, disk(dilation))
return ocean_mask
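# Example usage (sketch; assumes `land_da` is a hypothetical boolean array
# with True for land, and `ocean_da` is the rasterised ocean layer
# described above):
#
#     ocean_mask = ocean_masking(land_da, ocean_da, connectivity=1, dilation=3)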
def coastal_masking(ds, ocean_da, buffer=33):
"""
Creates a symmetrical buffer around the land-water boundary
    in an input boolean array. This is used to create a study area
mask that is focused on the coastal zone, excluding inland or
deeper ocean pixels.
Parameters:
-----------
ds : xarray.DataArray
A single time-step boolean array containing True for land
pixels, and False for water.
ocean_da : xarray.DataArray
A supplementary static dataset used to separate ocean waters
from other inland water. The array should contain values of 1
for high certainty ocean pixels, and 0 for all other pixels
(land, inland water etc). For Australia, we use the Geodata
100K coastline dataset, rasterized as the "geodata_coast_100k"
product on the DEA datacube.
buffer : integer, optional
The number of pixels to buffer the land-water boundary in
each direction.
Returns:
--------
coastal_mask : xarray.DataArray
        An array containing True within `buffer` pixels of the
land-water boundary, and False everywhere else.
"""
def _coastal_buffer(ds, buffer):
"""Generate coastal buffer from ocean-land boundary"""
buffer_ocean = binary_dilation(ds, buffer)
buffer_land = binary_dilation(~ds, buffer)
return buffer_ocean & buffer_land
# Identify ocean pixels based on overlap with the Geodata
# 100K coastline dataset
all_time_ocean = ocean_masking(ds, ocean_da)
# Generate coastal buffer from ocean-land boundary
coastal_mask = xr.apply_ufunc(
_coastal_buffer, all_time_ocean, disk(buffer), dask="parallelized"
)
# Return coastal mask as 1, and land pixels as 2
return coastal_mask.where(coastal_mask, ~all_time_ocean * 2)
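# Example usage (sketch; `all_time_land` is a hypothetical single boolean
# land/water layer, e.g. pixels classified as land in most timesteps):
#
#     coastal_mask = coastal_masking(all_time_land, ocean_da, buffer=33)
#     # Output values: 0 = ocean, 1 = coastal buffer, 2 = inland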
def temporal_masking(ds):
"""
Create a temporal mask by identifying land pixels with a direct
spatial connection (e.g. contiguous) to land pixels in either the
previous or subsequent timestep.
This is used to clean up noisy land pixels (e.g. caused by clouds,
white water, sensor issues), as these pixels typically occur
randomly with no relationship to the distribution of land in
neighbouring timesteps. True land, however, is likely to appear
in proximity to land before or after the specific timestep.
Parameters:
-----------
ds : xarray.DataArray
A multi-temporal array containing True for land pixels, and
False for water.
Returns:
--------
temporal_mask : xarray.DataArray
        A multi-temporal array containing True for pixels that are
        spatially connected to land in at least one neighbouring
        timestep.
"""
def _noncontiguous(labels, intensity):
# For each blob of land, obtain whether it intersected with land in
# any neighbouring timestep
region_props = regionprops(labels.values, intensity_image=intensity.values)
contiguous = [i.label for i in region_props if i.max_intensity == 0]
# Filter array to only contiguous land
noncontiguous_array = np.isin(labels, contiguous)
# Return as xr.DataArray
return xr.DataArray(
~noncontiguous_array, coords=labels.coords, dims=labels.dims
)
# Label independent groups of pixels in each timestep in the array
labelled_ds = xr.apply_ufunc(label, ds, None, 0, dask="parallelized").rename(
"labels"
)
# Check if a pixel was neighboured by land in either the
# previous or subsequent timestep by shifting array in both directions
masked_neighbours = (
(ds.shift(year=-1, fill_value=False) | ds.shift(year=1, fill_value=False))
.astype(int)
.rename("neighbours")
)
# Merge both into an xr.Dataset
label_neighbour_ds = xr.merge([labelled_ds, masked_neighbours])
    # Apply contiguity test to each year to obtain pixels that are
    # contiguous with (spatially connected to) land in the previous
    # or subsequent timestep
temporal_mask = label_neighbour_ds.groupby("year").apply(
lambda x: _noncontiguous(labels=x.labels, intensity=x.neighbours)
)
return temporal_mask
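# Example usage (sketch; `land_ds` is a hypothetical multi-temporal
# boolean array with a "year" dimension):
#
#     temporal_mask = temporal_masking(land_ds)
#     cleaned_ds = land_ds.where(temporal_mask)  # drop noisy land pixels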
def _create_mask(raster_mask, sieve_size, crs):
"""
Clean and dilate an annual raster produced by `certainty_masking`,
then vectorize into a dictionary of vector features that are
taken as an input by `contour_certainty`.
"""
# Clean mask by sieving to merge small areas of pixels into
# their neighbours.
sieved = xr.apply_ufunc(sieve, raster_mask, sieve_size)
# Apply greyscale dilation to expand masked pixels and
# err on the side of overclassifying certainty issues
dilated = xr.apply_ufunc(dilation, sieved, disk(3))
# Vectorise
vector_mask = xr_vectorize(
dilated,
crs=crs,
attribute_col="certainty",
)
# Dissolve column, fix geometry and rename classes
vector_mask = vector_mask.dissolve("certainty")
vector_mask["geometry"] = vector_mask.geometry.buffer(0)
vector_mask = vector_mask.rename(
{0: "good", 1: "unstable data", 2: "insufficient data"}
)
return (raster_mask.year.item(), vector_mask)
def certainty_masking(yearly_ds, obs_threshold=5, stdev_threshold=0.3, sieve_size=128):
"""
Generate annual vector polygon masks containing information
about the certainty of each extracted shoreline feature.
These masks are used to assign each shoreline feature with
important certainty information to flag potential issues with
the data.
Parameters:
-----------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual DEA Coastlines
rasters.
obs_threshold : int, optional
The minimum number of post-gapfilling Landsat observations
required for an extracted shoreline to be considered good
quality. Annual shorelines based on low numbers of
observations can be noisy due to the influence of
environmental noise like unmasked cloud, sea spray, white
water etc. Defaults to 5.
stdev_threshold : float, optional
The maximum MNDWI standard deviation required for a
post-gapfilled Landsat observation to be considered good
quality. Annual shorelines based on MNDWI with a high
standard deviation represent unstable data, which can
indicate that the tidal modelling process did not adequately
remove the influence of tide. For more information,
refer to Bishop-Taylor et al. 2021
(https://doi.org/10.1016/j.rse.2021.112734).
Defaults to 0.3.
sieve_size : int, optional
To reduce the complexity of the output masks, they are
first cleaned using `rasterio.features.sieve` to replace
small areas of pixels with the values of their larger
neighbours. This parameter sets the minimum polygon size
to retain in this process. Defaults to 128.
Returns:
--------
vector_masks : dictionary of geopandas.GeoDataFrames
        A dictionary with year as the key, and vector
data as a `geopandas.GeoDataFrame` for each year in the
analysis.
"""
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
# Identify problematic pixels
high_stdev = yearly_ds["stdev"] > stdev_threshold
low_obs = yearly_ds["count"] < obs_threshold
# Create raster mask with values of 0 for good data, values of
# 1 for unstable data, and values of 2 for insufficient data.
raster_mask = high_stdev.where(~low_obs, 2).astype(np.int16)
# Process in parallel
with ProcessPoolExecutor() as executor:
# Apply func in parallel, repeating params for each iteration
groups = [group for (i, group) in raster_mask.groupby("year")]
to_iterate = (
groups,
*(repeat(i, len(groups)) for i in [sieve_size, yearly_ds.odc.crs]),
)
        vector_masks = dict(executor.map(_create_mask, *to_iterate))
return vector_masks
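# Example usage (sketch; `yearly_ds` as returned by `load_rasters`):
#
#     certainty_masks = certainty_masking(
#         yearly_ds, obs_threshold=5, stdev_threshold=0.3, sieve_size=128
#     )
#     list(certainty_masks.keys())  # one vector mask per year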
def contour_certainty(contours_gdf, certainty_masks):
"""
Assigns a new certainty column to each annual shoreline feature
to identify features affected by:
    1) Low satellite observations: annual shorelines based on fewer than
5 annual observations after gapfilling
2) Unstable MNDWI composites (potentially indicating tidal modelling
issues): annual shorelines with MNDWI standard deviation > 0.3
Parameters:
-----------
contours_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing annual shorelines.
certainty_masks : dictionary
A dictionary of annual certainty mask vector features, as
generated by `coastlines.vector.contours_preprocess`.
Returns:
--------
contours_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing annual shorelines with
a new "certainty" column/field.
"""
# Loop through each annual shoreline and attribute data with certainty
out_list = []
for year, _ in contours_gdf.iterrows():
# Extract year
contour_gdf = contours_gdf.loc[[year]]
# Assign each shoreline segment with attributes from certainty mask
contour_gdf = contour_gdf.overlay(
certainty_masks[year].reset_index(), how="intersection"
)
# Set year field and use as index
contour_gdf["year"] = year
contour_gdf = contour_gdf.set_index("year")
out_list.append(contour_gdf)
# Combine into a single dataframe
contours_gdf = pd.concat(out_list).sort_index()
# Finally, set all 1991 and 1992 coastlines north of -23 degrees
# latitude to 'uncertain' due to Mt Pinatubo aerosol issue
pinatubo_lat = (contours_gdf.centroid.to_crs("EPSG:4326").y > -23) & (
contours_gdf.index.isin([1991, 1992])
)
contours_gdf.loc[pinatubo_lat, "certainty"] = "aerosol issues"
return contours_gdf
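# Example usage (sketch; assumes `contours_gdf` is indexed by year and
# `certainty_masks` was produced by `certainty_masking` above):
#
#     contours_gdf = contour_certainty(contours_gdf, certainty_masks)
#     contours_gdf.certainty.unique()  # e.g. "good", "unstable data"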
def contours_preprocess(
yearly_ds,
gapfill_ds,
water_index,
index_threshold,
buffer_pixels=50,
mask_temporal=True,
mask_modifications=None,
debug=False,
):
"""
Prepares and preprocesses DEA Coastlines raster data to
restrict the analysis to coastal shorelines, and extract data
that is used to assess the certainty of extracted shorelines.
This function:
1) Identifies areas affected by either unstable composites or low data
2) Fills low data areas in annual layers with three-year gapfill
3) Computes an all-time coastal mask based on the observed timeseries
of water and land pixels, after first optionally cleaning the data
using land cover data, NDWI values and a temporal contiguity mask
4) Identifies pixels directly connected to the ocean in each annual
    timestep, and uses this to remove inland waterbodies from the analysis
Parameters:
-----------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual DEA Coastlines rasters.
gapfill_ds : xarray.Dataset
An `xarray.Dataset` containing three-year gapfill DEA Coastlines
rasters.
water_index : string
A string giving the name of the water index included in the
annual and gapfill datasets (e.g. 'mndwi').
index_threshold : float
A float giving the water index threshold used to separate land
and water (e.g. 0.00).
buffer_pixels : int, optional
The number of pixels by which to buffer the all time shoreline
detected by this function to produce an overall coastal buffer.
        The default is 50 pixels, which at 30 m Landsat resolution
        produces a coastal buffer with a radius of approximately 1,500 m.
mask_temporal : bool, optional
Whether to apply a temporal contiguity mask by identifying land
pixels with a direct spatial connection (e.g. contiguous) to
land pixels in either the previous or subsequent timestep. This is
used to clean up noisy land pixels (e.g. caused by clouds,
white water, sensor issues), as these pixels typically occur
randomly with no relationship to the distribution of land in
neighbouring timesteps. True land, however, is likely to appear
in proximity to land before or after the specific timestep.
Defaults to True.
mask_modifications : geopandas.GeoDataFrame, optional
An optional polygon dataset including features to remove or add
to the all-time coastal mask. This should include a column/field
named 'type' that contains two possible values:
- 'add': features to add to the coastal mask (e.g. for
including areas of missing shorelines that were
previously removed by the coastal mask)
- 'remove': features to remove from the coastal mask (e.g.
areas of non-coastal rivers or estuaries,
irrigated fields or aquaculture that you wish
to exclude from the analysis)
debug : boolean, optional
Whether to return all intermediate layers for troubleshooting.
Returns:
--------
masked_ds : xarray.Dataset
A dataset containing water index data for each annual timestep
that has been masked to the coastal zone. This can then be used
as an input to subpixel waterline extraction.
certainty_masks : dict
        A dictionary containing one `geopandas.GeoDataFrame` for each year
in the time period, with polygons identifying any potentially
problematic region. This is used to assign each output shoreline
with a certainty column.
"""
# Remove low obs pixels and replace with 3-year gapfill
combined_ds = yearly_ds.where(yearly_ds["count"] > 5, gapfill_ds)
# Set any pixels with only one observation to NaN, as these are
# extremely vulnerable to noise
combined_ds = combined_ds.where(yearly_ds["count"] > 1)
# Apply water index threshold and re-apply nodata values
nodata = combined_ds[water_index].isnull()
thresholded_ds = combined_ds[water_index] < index_threshold
thresholded_ds = thresholded_ds.where(~nodata)
# Compute temporal mask that restricts the analysis to land pixels
# with a direct spatial connection (e.g. contiguous) to land pixels
# in the previous or subsequent timestep. Set any pixels outside
# mask to 0 to represent water
temporal_mask = temporal_masking(thresholded_ds == 1)
thresholded_ds = thresholded_ds.where(temporal_mask)
# Create all time layers by identifying pixels that are land in at
# least 20% and 80% of valid observations; the 20% layer is used to
# identify pixels that contain land for even a small period of time;
# this is used for producing an all-time buffered "coastal" study area.
# The 80% layer is used to more conservatively identify pixels that
# are land for most of the time series years; this is used for extracting
# rivers as it better accounts for migrating/dynamic river channels.
all_time_20 = thresholded_ds.mean(dim="year") > 0.2
all_time_80 = thresholded_ds.mean(dim="year") > 0.8
# Identify narrow river and stream features using the `black_tophat`
# transform. To avoid narrow channels between islands and the
# mainland being mistaken for rivers, first apply `sieve` to any
# connected land pixels (i.e. islands) smaller than 5 km^2.
# Use a disk of size 8 to identify any rivers/streams smaller than
# approximately 240 m (e.g. 8 * 30 m = 240 m).
island_size = int(5000000 / (30 * 30)) # 5 km^2
sieved = xr.apply_ufunc(sieve, all_time_80.astype(np.int16), island_size)
rivers = xr.apply_ufunc(black_tophat, (sieved & all_time_80), disk(8)) == 1
# Create a river mask by eroding the all time layer to clip out river
# mouths, then expanding river features to include stream banks and
# account for migrating rivers
river_mouth_mask = xr.apply_ufunc(
binary_erosion, all_time_80.where(~rivers, True), disk(12)
)
rivers = rivers.where(river_mouth_mask, False)
river_mask = ~xr.apply_ufunc(binary_dilation, rivers, disk(4))
# Load Geodata 100K coastal layer to use to separate ocean waters from
# other inland waters. This product has values of 0 for ocean waters,
# and values of 1 and 2 for mainland/island pixels. We extract ocean
# pixels (value 0), then erode these by 10 pixels to ensure we only
# use high certainty deeper water ocean regions for identifying ocean
# pixels in our satellite imagery. If no Geodata data exists (e.g.
    # over remote ocean waters), use an all-True array to represent ocean.
try:
dc = datacube.Datacube()
geodata_da = dc.load(
product="geodata_coast_100k",
like=combined_ds.odc.geobox.compat,
).land.squeeze("time")
ocean_da = xr.apply_ufunc(binary_erosion, geodata_da == 0, disk(10))
except AttributeError:
ocean_da = odc.geo.xr.xr_zeros(combined_ds.odc.geobox) == 0
except ValueError: # Temporary workaround for no geodata access for tests
ocean_da = xr.apply_ufunc(binary_erosion, all_time_20 == 0, disk(20))
# Use all time and Geodata 100K data to produce the buffered coastal
# study area. The output has values of 0 representing non-coastal
# "ocean", values of 1 representing "coastal", and values of 2
# representing non-coastal "inland" pixels.
coastal_mask = coastal_masking(
ds=all_time_20.where(~rivers, True), ocean_da=ocean_da, buffer=buffer_pixels
)
# Add rivers as "inland" pixels in the coastal mask
coastal_mask = xr.where(river_mask, coastal_mask, 2)
# Optionally modify the coastal mask using manually supplied
# polygons to add missing areas of shoreline, or remove unwanted
# areas from the mask.
if mask_modifications is not None:
# Only proceed if there are polygons available
if len(mask_modifications.index) > 0:
# Convert type column to integer, with 1 representing pixels
            # to add to the coastal mask (by setting them as "coastal"
            # pixels), and 2 representing pixels to remove from the mask
            # (by setting them as "inland" pixels)
mask_modifications = mask_modifications.replace({"add": 1, "remove": 2})
# Rasterise polygons into extent of satellite data
modifications_da = xr_rasterize(
mask_modifications, da=yearly_ds, attribute_col="type"
)
# Where `modifications_da` has a value other than 0,
# replace values from `coastal_mask` with `modifications_da`
coastal_mask = coastal_mask.where(modifications_da == 0, modifications_da)
# Generate individual annual masks by selecting only water pixels that
# are directly connected to the ocean in each yearly timestep
annual_mask = (
# Treat both 1s and NaN pixels as land (i.e. True)
(thresholded_ds != 0)
# Mask out inland regions (i.e. values of 2 in `coastal_mask`)
.where(coastal_mask != 2)
# Keep pixels directly connected to ocean in each timestep
.groupby("year").map(
func=ocean_masking,
ocean_da=ocean_da,
connectivity=1,
dilation=3,
)
)
# Finally, apply temporal and annual masks to our surface water
# index data, then clip to "coastal" pixels in `coastal_mask`
masked_ds = combined_ds[water_index].where(
temporal_mask & annual_mask & (coastal_mask == 1)
)
# Generate annual vector polygon masks containing information
# about the certainty of each shoreline feature
certainty_masks = certainty_masking(combined_ds, stdev_threshold=0.3)
# Return all intermediate layers if debug=True
if debug:
return (
masked_ds,
certainty_masks,
all_time_20,
all_time_80,
river_mask,
ocean_da,
thresholded_ds,
temporal_mask,
annual_mask,
coastal_mask,
)
else:
return masked_ds, certainty_masks
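# Example usage (sketch; parameter values are illustrative defaults):
#
#     masked_ds, certainty_masks = contours_preprocess(
#         yearly_ds,
#         gapfill_ds,
#         water_index="mndwi",
#         index_threshold=0.0,
#     )
#     # The masked index data can then be passed to subpixel waterline
#     # extraction, e.g. `subpixel_contours(da=masked_ds, z_values=0.0)`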
def points_on_line(gdf, index, distance=30):
"""
Generates evenly-spaced point features along a specific line feature
in a `geopandas.GeoDataFrame`.
Parameters:
-----------
gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing line features with an
index and CRS.
index : string or int
        A value giving the index of the line to generate points along.
distance : integer or float, optional
A number giving the interval at which to generate points along
the line feature. Defaults to 30, which will generate a point
at every 30 metres along the line.
Returns:
--------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing point features at every
`distance` along the selected line.
"""
# Select individual line to generate points along
line_feature = gdf.loc[[index]].geometry
# If multiple features are returned, take unary union
    if line_feature.shape[0] > 1:
line_feature = line_feature.unary_union
else:
line_feature = line_feature.iloc[0]
# Generate points along line and convert to geopandas.GeoDataFrame
points_line = [
line_feature.interpolate(i)
        for i in np.arange(0, line_feature.length, distance)
]
points_gdf = gpd.GeoDataFrame(geometry=points_line, crs=gdf.crs)
return points_gdf
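# Example usage (sketch; 2021 is a hypothetical baseline year present in
# the index of `contours_gdf`):
#
#     points_gdf = points_on_line(contours_gdf, index=2021, distance=30)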
def annual_movements(
points_gdf, contours_gdf, yearly_ds, baseline_year, water_index, max_valid_dist=1000
):
"""
For each rate of change point along the baseline annual coastline,
compute the distance to the nearest point on all neighbouring annual
coastlines and add this data as new fields in the dataset.
Distances are assigned a directionality (negative = located inland,
positive = located sea-ward) by sampling water index values from the
underlying DEA Coastlines rasters to determine if a coastline was
located in wetter or drier terrain than the baseline coastline.
Parameters:
-----------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points.
contours_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing annual coastlines.
yearly_ds : xarray.Dataset
        An `xarray.Dataset` containing annual DEA Coastlines rasters.
    baseline_year : string
        A string giving the year used as the baseline when generating
        the rates of change points dataset. This is used to load DEA
        Coastlines water index rasters to calculate change
        directionality.
    water_index : string
        A string giving the water index used in the analysis. This is
        used to load DEA Coastlines water index rasters to calculate
        change directionality.
max_valid_dist : int or float, optional
Any annual distance greater than this distance will be set
to `np.nan`.
Returns:
--------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points
with added 'dist_*' attribute columns giving the distance to
each annual coastline from the baseline. Negative values
indicate that an annual coastline was located inland of the
baseline; positive values indicate the coastline was located
towards the ocean.
"""
def _point_interp(points, array, **kwargs):
points_gs = gpd.GeoSeries(points)
x_vals = xr.DataArray(points_gs.x, dims="z")
y_vals = xr.DataArray(points_gs.y, dims="z")
return array.interp(x=x_vals, y=y_vals, **kwargs)
# Get array of water index values for baseline time period
baseline_array = yearly_ds[water_index].sel(year=int(baseline_year))
# Copy baseline point geometry to new column in points dataset
points_gdf["p_baseline"] = points_gdf.geometry
# Years to analyse
years = contours_gdf.index.unique().values
# Iterate through all comparison years in contour gdf
for comp_year in years:
# Set comparison contour
comp_contour = contours_gdf.loc[[comp_year]].geometry.iloc[0]
# Find nearest point on comparison contour, and add these to points dataset
points_gdf[f"p_{comp_year}"] = points_gdf.apply(
lambda x: nearest_points(x.p_baseline, comp_contour)[1], axis=1
)
# Compute distance between baseline and comparison year points and add
# this distance as a new field named by the current year being analysed
distances = points_gdf.apply(
lambda x: x.geometry.distance(x[f"p_{comp_year}"]), axis=1
)
        # Set any value over `max_valid_dist` metres to NaN
points_gdf[f"dist_{comp_year}"] = distances.where(distances < max_valid_dist)
# Extract comparison array containing water index values for the
# current year being analysed
comp_array = yearly_ds[water_index].sel(year=int(comp_year))
# Sample water index values for baseline and comparison points
points_gdf["index_comp_p1"] = _point_interp(points_gdf.geometry, comp_array)
points_gdf["index_baseline_p2"] = _point_interp(
points_gdf[f"p_{comp_year}"], baseline_array
)
# Compute change directionality (positive = located towards the
# ocean; negative = located inland)
points_gdf["loss_gain"] = np.where(
points_gdf.index_baseline_p2 > points_gdf.index_comp_p1, 1, -1
)
# Ensure NaNs are correctly propagated (otherwise, X > NaN
# will return False, resulting in an incorrect land-ward direction)
is_nan = points_gdf[["index_comp_p1", "index_baseline_p2"]].isna().any(axis=1)
points_gdf["loss_gain"] = points_gdf["loss_gain"].where(~is_nan)
# Multiply distance to set change to negative, positive or NaN
points_gdf[f"dist_{comp_year}"] = (
points_gdf[f"dist_{comp_year}"] * points_gdf.loss_gain
)
# Calculate compass bearing from baseline to comparison point;
# first we need our points in lat-lon
lat_lon = points_gdf[["p_baseline", f"p_{comp_year}"]].apply(
lambda x: gpd.GeoSeries(x, crs=points_gdf.crs).to_crs("EPSG:4326")
)
geodesic = pyproj.Geod(ellps="WGS84")
bearings = geodesic.inv(
lons1=lat_lon.iloc[:, 0].values.x,
lats1=lat_lon.iloc[:, 0].values.y,
lons2=lat_lon.iloc[:, 1].values.x,
lats2=lat_lon.iloc[:, 1].values.y,
)[0]
# Add bearing as a new column after first restricting
# angles between 0 and 180 as we are only interested in
# the overall axis of our points e.g. north-south
points_gdf[f"bearings_{comp_year}"] = bearings % 180
# Calculate mean and standard deviation of angles
points_gdf["angle_mean"] = (
points_gdf.loc[:, points_gdf.columns.str.contains("bearings_")]
.apply(lambda x: circmean(x, high=180), axis=1)
.round(0)
.astype(int)
)
points_gdf["angle_std"] = (
points_gdf.loc[:, points_gdf.columns.str.contains("bearings_")]
.apply(lambda x: circstd(x, high=180), axis=1)
.round(0)
.astype(int)
)
# Keep only required columns
to_keep = points_gdf.columns.str.contains("dist|geometry|angle")
points_gdf = points_gdf.loc[:, to_keep]
points_gdf = points_gdf.assign(**{f"dist_{baseline_year}": 0.0})
points_gdf = points_gdf.round(2)
return points_gdf
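# Example usage (sketch; the baseline year below is a hypothetical choice):
#
#     points_gdf = annual_movements(
#         points_gdf,
#         contours_gdf,
#         yearly_ds,
#         baseline_year="2021",
#         water_index="mndwi",
#         max_valid_dist=1000,
#     )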
def outlier_mad(points, thresh=3.5):
"""
Use robust Median Absolute Deviation (MAD) outlier detection
algorithm to detect outliers. Returns a boolean array with True if
points are outliers and False otherwise.
Parameters:
-----------
    points : numpy.ndarray
        An n-observations by n-dimensions array of observations
    thresh : float, optional
        The modified z-score to use as a threshold. Observations with a
        modified z-score (based on the median absolute deviation) greater
        than this value will be classified as outliers.
Returns:
--------
    mask : numpy.ndarray
        An n-observations-length boolean array.
References:
----------
Source: https://github.com/joferkington/oost_paper_code/blob/master/utilities.py
Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
if len(points.shape) == 1:
points = points[:, None]
median = np.median(points, axis=0)
diff = np.sum((points - median) ** 2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
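# Example usage (sketch): the modified z-score is computed as
# 0.6745 * |x - median| / MAD, so only the final value below is flagged.
#
#     vals = np.array([0.1, 0.2, 0.15, 0.18, 5.0])
#     outlier_mad(vals)  # -> array([False, False, False, False,  True])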
def outlier_ransac(xy_df, **kwargs):
"""
Use the RANSAC (RANdom SAmple Consensus) algorithm to
robustly identify outliers. Returns a boolean array with True if
points are outliers and False otherwise.
Parameters:
-----------
    xy_df : numpy.ndarray
        An n-observations by 2 array of (x, y) observations
**kwargs :
Any parameters to pass to
`sklearn.linear_model.RANSACRegressor`
Returns:
--------
    mask : numpy.ndarray
        An n-observations-length boolean array.
"""
from sklearn import linear_model
# X and y inputs
X = xy_df[:, 0].reshape(-1, 1)
y = xy_df[:, 1].reshape(-1, 1)
# Robustly fit linear model with RANSAC algorithm
ransac = linear_model.RANSACRegressor(**kwargs)
ransac.fit(X, y)
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
return outlier_mask
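# Example usage (sketch; a toy set of (x, y) observations where the last
# point sits far from the line fitted through the rest):
#
#     xy = np.array([[0, 0.0], [1, 1.1], [2, 1.9], [3, 3.0], [4, 10.0]])
#     outlier_ransac(xy)  # the last point is likely flagged as an outlier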
def change_regress(
y_vals,
x_vals,
x_labels,
threshold=3.5,
detrend_params=None,
slope_var="slope",
interc_var="intercept",
pvalue_var="pvalue",
stderr_var="stderr",
outliers_var="outliers",
):
"""
For a given row in a `pandas.DataFrame`, apply linear regression to
data values (as y-values) and a corresponding sequence of x-values,
and return 'slope', 'intercept', 'pvalue', and 'stderr' regression
parameters.
Before computing the regression, outliers are identified using a
robust Median Absolute Deviation (MAD) outlier detection algorithm,
and excluded from the regression. A list of these outliers will be
recorded in the output 'outliers' variable.
Parameters:
-----------
x_vals, y_vals : list of numeric values, or nd.array
A sequence of values to use as the x and y variables
x_labels : list
A sequence of strings corresponding to each value in `x_vals`.
This is used to label any observations that are flagged as
outliers (often, this can simply be set to the same list