Skip to content

Commit

Permalink
Merge pull request #39 from internetofwater/gage_meta
Browse files Browse the repository at this point in the history
Gage metadata additions.
  • Loading branch information
dblodgett-usgs authored Oct 8, 2024
2 parents a9be0ab + e980703 commit bc52b97
Show file tree
Hide file tree
Showing 7 changed files with 3,445 additions and 23 deletions.
86 changes: 75 additions & 11 deletions R/gage_locations.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ get_cdec_gage_locations <- function(gages) {
nhdpv2_COMID = comid_medres,
provider_id = id) |>
mutate(nhdpv2_REACH_measure = rep(NA_real_, n()),
nhdpv2_COMID = as.numeric(nhdpv2_COMID))
nhdpv2_COMID = as.numeric(nhdpv2_COMID),
nhdpv2_link_source = "https://cdec.water.ca.gov")
}

# gages <- targets::tar_read("co_gage")
Expand All @@ -117,31 +118,37 @@ get_co_gage_locations <- function(gages) {
select(provider_id = `DWR Abbrev`) |>
mutate(nhdpv2_REACHCODE = rep(NA_character_, n()),
nhdpv2_COMID = rep(NA_integer_, n()),
nhdpv2_REACH_measure = rep(NA_real_, n()))
nhdpv2_REACH_measure = rep(NA_real_, n()),
nhdpv2_link_source = rep(NA_character_, n()))

}

get_nwis_hydrolocations <- function(nhdpv2_gage,
swim_gage,
nwis_hydrolocation) {

nh <- read.csv(nwis_hydrolocation, colClasses = c("character",
"integer",
"character",
"numeric"))

nh <- mutate(nh, nhdpv2_link_source = "https://github.com/internetofwater/ref_gages/blob/main/data/nwis_hydrolocations.csv")

if(any(swim_gage$Gage_no %in% nh$provider_id)) stop("duplicates in override registry")

swim_gage <- sf::st_drop_geometry(swim_gage) |>
select(provider_id = Gage_no,
nhdpv2_COMID = COMID,
nhdpv2_REACHCODE = REACHCODE,
nhdpv2_REACH_measure = REACH_meas) |>
filter(nhdpv2_COMID != -9999)
filter(nhdpv2_COMID != -9999) |>
mutate(nhdpv2_link_source = "https://doi.org/10.5066/P9J5CK2Y")

nh <- bind_rows(nh, swim_gage)

nhdpv2_gage <- filter(st_drop_geometry(nhdpv2_gage),
!provider_id %in% nh$provider_id)
!provider_id %in% nh$provider_id) |>
mutate(nhdpv2_link_source = "https://www.epa.gov/waterdata/nhdplus-national-hydrography-dataset-plus")

bind_rows(nhdpv2_gage, nh)

Expand All @@ -154,7 +161,7 @@ get_nwis_hydrolocations <- function(nhdpv2_gage,
get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locations, nhdpv2_fline,
da_diff_thresh = 0.5, search_radius_m = 500,
max_matches_in_radius = 5) {

v2_area <- select(nhdplusTools::get_vaa(),
nhdpv2_COMID = comid,
nhdpv2_totdasqkm = totdasqkm)
Expand All @@ -171,6 +178,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
all_gages$nhdpv2_REACHCODE <- NA
all_gages$nhdpv2_REACH_measure <- NA
all_gages$nhdpv2_COMID <- NA
all_gages$nhdpv2_link_source <- NA
all_gages$nhdpv2_offset_m <- NA

for(hl in hydrologic_locations) {

Expand All @@ -188,6 +197,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
hl$locations$nhdpv2_REACH_measure
all_gages$nhdpv2_COMID[provider_selector][matcher] <-
hl$locations$nhdpv2_COMID
all_gages$nhdpv2_link_source[provider_selector][matcher] <-
hl$locations$nhdpv2_link_source

# Some gages missing reachcode/measure but have COMID
update_index <- is.na(all_gages$nhdpv2_REACH_measure & !is.na(all_gages$nhdpv2_COMID))
Expand All @@ -204,13 +215,33 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati

all_gages <- left_join(all_gages, v2_area, by = "nhdpv2_COMID")

diff_da <- abs(all_gages$nhdpv2_totdasqkm -
all_gages$drainage_area_sqkm) /
all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm

norm_diff_da <- all_gages$da_diff /
all_gages$drainage_area_sqkm

bad_da <- all_gages[!is.na(diff_da) & diff_da > da_diff_thresh, ]
abs_norm_diff_da <- abs(norm_diff_da)

bad_da <- all_gages[!is.na(all_gages$da_diff) & # has an estimate

((all_gages$drainage_area_sqkm <= 100 &
# use unnormalized because differences so quantized
# due to catchment resolution.
# when da_diff is negative, use within 25%
(all_gages$da_diff > 10 | (all_gages$da_diff < 0 & abs_norm_diff_da > 0.25))) |

# is something link ten or more catchments and within 10%
# drainage area threshold was selected based on distribution of catchment size.
(all_gages$drainage_area_sqkm > 100 &
abs_norm_diff_da > (0.1)) |

# is something like a hundred or more catchments and within 5%
# drainage area threshold was selected based on distribution of catchment size.
(all_gages$drainage_area_sqkm > 500 &
abs_norm_diff_da > (0.05))), ]

update_index <- which(is.na(all_gages$nhdpv2_COMID) |
!all_gages$nhdpv2_COMID %in% nhdpv2_fline$COMID |
all_gages$provider_id %in% bad_da$provider_id)

no_location <- all_gages[update_index, ]
Expand Down Expand Up @@ -251,18 +282,51 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
linked_gages <- select(no_location, provider_id) |>
mutate(id = seq_len(n())) |>
left_join(select(linked_gages_dedup,
id, COMID, REACHCODE, REACH_meas),
id, COMID, REACHCODE, REACH_meas, nhdpv2_totdasqkm, offset),
by = "id")

all_gages$nhdpv2_REACHCODE[update_index] <- linked_gages$REACHCODE
all_gages$nhdpv2_REACH_measure[update_index] <- linked_gages$REACH_meas
all_gages$nhdpv2_COMID[update_index] <- linked_gages$COMID
all_gages$nhdpv2_totdasqkm[update_index] <- linked_gages$nhdpv2_totdasqkm
all_gages$nhdpv2_link_source[update_index] <- rep("https://github.com/internetofwater/ref_gages", nrow(linked_gages))
all_gages$nhdpv2_offset_m[update_index] <- linked_gages$offset

all_gages$nhdpv2_totdasqkm <- round(all_gages$nhdpv2_totdasqkm, digits = 1)
all_gages$drainage_area_sqkm <- round(all_gages$drainage_area_sqkm, digits = 1)

all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm

all_gages
add_offset(all_gages, nhdpv2_fline)
}

add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {
add_offset <- function(all_gages, nhdpv2_fline) {

missing_offset <- all_gages |>
filter(is.na(nhdpv2_offset_m) & !is.na(nhdpv2_REACHCODE))

missing_offset <- sf::st_transform(missing_offset, sf::st_crs(nhdpv2_fline))

new_indexes <- hydroloom::index_points_to_lines(nhdpv2_fline, sf::st_geometry(missing_offset),
search_radius = units::set_units(1000, "meters"),
ids = as.integer(missing_offset$nhdpv2_COMID))

missing_offset$point_id <- seq_len(nrow(missing_offset))

missing_offset <- left_join(missing_offset, new_indexes, by = "point_id")

missing_offset$nhdpv2_offset_m <- missing_offset$offset

missing_offset <- sf::st_transform(missing_offset, sf::st_crs(all_gages))

missing_offset[is.na(missing_offset$nhdpv2_offset_m), c('nhdpv2_REACHCODE', 'nhdpv2_REACH_measure','nhdpv2_COMID','nhdpv2_totdasqkm','nhdpv2_link_source')] <- NA

dplyr::bind_rows(filter(all_gages, !id %in% missing_offset$id),
select(missing_offset, all_of(names(all_gages))))
}

add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {

mainstems <- mainstems[,c("head_nhdpv2_COMID", "uri"), drop = TRUE]
mainstems$head_nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "",
mainstems$head_nhdpv2_COMID))
Expand Down
5 changes: 4 additions & 1 deletion R/registry.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ build_reference_location <- function(gl, reference_locations, registry, provider
if(nrow(loc) == 0) return(existing_locations)

# else return all the old plus some new
bind_rows(existing_locations, loc)
out <- bind_rows(existing_locations, loc)

write_csv(out, reference_locations)

out
}

48 changes: 48 additions & 0 deletions R/validation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
validate_ref_gage <- function(registry_csv, reference_file,
reference_locations_csv,
providers_lookup_csv, reference_out) {

registry <- read_csv(registry_csv)

reference <- read_sf(reference_file)

ref_locations <- read_csv(reference_locations_csv)

providers <- read_csv(providers_lookup_csv)

if(any(!reference$id %in% registry$id)) {
stop("missing ids in registry")
}

if(any(!reference$id %in% ref_locations$id)) {
stop("missing ids in ref locations")
}

if(any(!reference$provider %in% providers$provider)) {
stop("missing providers")
}

if(any(!registry$provider %in% providers$id)) {
stop("registry has unknown providers")
}

if(any(sapply(names(registry), \(x) any(is.na(registry[[x]]))))) {
stop("NAs not allowed in registry")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_offset_m))) {
stop("if a comid is identified it must have an offset")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_REACH_measure))) {
stop("if a comid is identified it must have a measure")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_link_source))) {
stop("if a comid is identified it must have a link source")
}

if(any(sf::st_is_empty(sf::st_geometry(reference)))) {
stop("all geometry must not be empty")
}
}
12 changes: 8 additions & 4 deletions R/write_out.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
write_reference <- function(gage_hydrologic_locations, registry, providers, reference_file, nldi_file,
duplicate_locations) {

duplicate_locations$cluster_string <- unlist(lapply(duplicate_locations$cluster_id, \(x) {
if(is.null(x)) return("")
paste(paste0("https://geoconnex.us/ref/gages/", x), collapse = ",")
Expand All @@ -18,13 +18,16 @@ write_reference <- function(gage_hydrologic_locations, registry, providers, refe
distinct()

if(any(duplicated(out$identifier))) stop("duplicate identifiers?")

out <- out |>
left_join(select(convert_provider_id(registry, providers),
uri, identifier, id), by = "identifier") %>%
select(id, uri, name, description, subjectOf,
provider, provider_id, nhdpv2_REACHCODE,
nhdpv2_REACH_measure, nhdpv2_COMID, mainstem_uri) %>%
nhdpv2_REACH_measure, nhdpv2_COMID, nhdpv2_totdasqkm,
nhdpv2_link_source, nhdpv2_offset_m,
gage_totdasqkm = drainage_area_sqkm,
dasqkm_diff = da_diff, mainstem_uri) %>%
mutate(id = as.integer(id)) |>
left_join(dup, by = "uri")

Expand All @@ -43,7 +46,8 @@ write_usgs_reference <- function(gage_hydrologic_locations, registry, providers,
left_join(select(convert_provider_id(registry, providers),
uri, identifier, id), by = "identifier") %>%
filter(provider == "https://waterdata.usgs.gov") %>%
select(id, uri, name, description, subjectOf, provider, provider_id, nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
select(id, uri, name, description, subjectOf, provider, provider_id,
nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
mutate(id = as.integer(id))

out$id <- out$provider_id
Expand Down
31 changes: 24 additions & 7 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,21 @@ library(targets)

tar_option_set(packages = c("nhdplusTools", "sf", "dplyr", "dataRetrieval",
"sbtools", "readr", "knitr", "mapview", "data.table"),
memory = "transient", garbage_collection = TRUE,
debug = "gage_hydrologic_locations_with_mainstems")
memory = "transient", garbage_collection = TRUE)

# primary output file for geoconnex reference server
reference_file <- "out/ref_gages.gpkg"

# registry csv file which is checked in
registry_csv <- "reg/ref_gages.csv"

# locations for all known reference gages
# https://github.com/internetofwater/ref_gages/issues/33
reference_locations_csv <- "reg/ref_locations.csv"

# contains information for each gage provider
providers_lookup_csv <- "reg/providers.csv"

# this is a set of location overrides
nwis_hydrolocation <- "data/nwis_hydrolocations.csv"

Expand Down Expand Up @@ -63,6 +73,8 @@ list(
pnw_gage)),

### location normalization ###
# these targets generate a normalized form set of gages from each source.

# This Gage layer from NHDPlusV2 is a basic starting point for
# NWIS gage locations.
tar_target("nhdpv2_gage", select(read_sf(nat_db, "Gage"),
Expand All @@ -84,19 +96,19 @@ list(
# Each entry will have a provider and provider_id that acts as a unique
# primary key. The existing registry file will have a unique attribute
# that contains that primary key.
tar_target("providers_csv", command = "reg/providers.csv", format = "file"),
tar_target("providers_csv", providers_lookup_csv, format = "file"),
tar_target("providers", read_csv(providers_csv)),


tar_target("registry", build_registry(gage_locations,
registry = "reg/ref_gages.csv",
registry = registry_csv,
providers = providers)),

# Also create a table of reference locations for the registered gages.
# unlike the registry, this may update to have the "best" location of a gage.
tar_target("ref_locations", build_reference_location(gage_locations,
reference_locations = "reg/ref_locations.csv",
registry = "reg/ref_gages.csv",
reference_locations = reference_locations_csv,
registry = registry_csv,
providers = providers)),

### spatial integration ###
Expand Down Expand Up @@ -134,4 +146,9 @@ list(
registry, providers, reference_file,
nldi_file,
duplicate_locations = duplicate_locations)),
tar_target("registry_out", write_registry(registry, "reg/ref_gages.csv")))
tar_target("registry_out", write_registry(registry, registry_csv)),

tar_target("validation", validate_ref_gage(registry_csv, reference_file,
reference_locations_csv,
providers_lookup_csv,
reference_out)))
Loading

0 comments on commit bc52b97

Please sign in to comment.