-
Notifications
You must be signed in to change notification settings - Fork 0
/
catFlow_buildData.R
195 lines (177 loc) · 11.4 KB
/
catFlow_buildData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
##catFlow: relationships between StreamCat landscape attributes and streamflow
##script to build data object joining NWIS/NHDplus, streamCat, GAGESII and flows
if (!require("dataRetrieval")) { install.packages("dataRetrieval", dependencies = TRUE); library(dataRetrieval)}
if (!require("httr")) { install.packages("httr", dependencies = TRUE); library(httr)}
if (!require("dplyr")) { install.packages("dplyr", dependencies = TRUE); library(dplyr)}
#### Cross reference NWIS gage numbers to NHDPlus2 COMIDs
# #Lookup built from NHDplus National GageInfo and GageLoc .dbfs, dropping unused cols and renaming
# # see /Users/Dan/Google Drive/NHDplus/NHDPlusNationalData/makeGageLookup.R
# # note 233 GAGEIDs/rows have COMIDs but not full info due to missing in original GageInfo table
# # 28164 by 18, COMID (integer); limited metadata relative to dataRetrieval functions...
# nwis2nhdp = readRDS(gzcon(url("https://github.com/daauerbach/catFlow/raw/master/NWIStoNHDplusV2.rds")))
#### Join StreamCat features to USGS gages
#### NOTE THESE STEPS REQUIRE CONSIDERABLE STORAGE AND TIME
# #build a national library of state.rds objects
# devtools::source_url("https://raw.githubusercontent.com/daauerbach/streamcatUtils/master/scu_getStreamCatST.R")
# setwd("/Users/Dan/Google Drive/NHDplus")
# for(s in state.abb[-grep("AK|HI", state.abb)]) getStreamCatST(s, dirOut = "catdata")
# #bind up all available units, removing duplicates
# devtools::source_url("https://raw.githubusercontent.com/daauerbach/streamcatUtils/master/scu_bindStreamCatST.R")
# usa = filter(bindStreamCatST(state.abb[-grep("AK|HI", state.abb)]
# ,dirCatdata = "/Users/Dan/Google Drive/NHDplus/catdata")
# , !duplicated(COMID))
# #join to previous...28164 by 240
# nwis2cat = left_join(nwis2nhdp, usa, by="COMID")
# ### Note 3400 in NHDplus lookup are missing from streamCat...
# ### mostly Hawaii (1280) and PR (611), but substantial numbers from FL, CA and ID...
# sort(table(filter(nwis2cat, is.na(nwis2cat$CatAreaSqKm))$STATE))
# ### original run: saveRDS(nwis2cat, "catFlowOrig/NWIS_NHDp_StreamCat_all.rds") #about 26mb
nwis2cat = readRDS(gzcon(url("https://github.com/daauerbach/catFlow/raw/master/NWIS_Sc_all.rds")))
#### Join the GAGESII data
#### GAGESII data built from "GAGES/basinchar_and_report_sept_2011/spreadsheets-in-csv-format"
# http://water.usgs.gov/GIS/dsdl/basinchar_and_report_sept_2011.zip
# and with spatial: http://water.usgs.gov/GIS/dsdl/gagesII_9322_point_shapefile.zip
#9067 by 335, col name added instead of rename to confirm match
g2 = readRDS(gzcon(url("https://github.com/daauerbach/catFlow/raw/master/GAGESII_9322_sept30_2011_tbldf.rds"))) %>% mutate(GAGEID = STAID)
# #Disagreements: 9067 with text attributes
# nrow(readRDS("GAGESII/spdf_gagesII_9322_sept30_2011.rds")) #9322 in the spatial points
# sum(table(nwis2cat$GagesII)) #9203 in the NHDplus lookup, of which 7217 nonref, 1986 ref
# sum(nwis2cat$GAGEID %in% g2$STAID) #9040...so 282 or 3% of 9322 are missing
## ORIGINAL VERSION (SUPPORTING TECH NOTE MSCPT) in catFlowOrig/catFlowWork.R
g2cat = left_join(g2, nwis2cat, by="GAGEID") #TRUE, for confirmation: identical(g2cat$STAID, g2cat$GAGEID)
rm(g2, nwis2cat) #clean up...
# #initially includes 7094 nonref, 1944 ref, 29 NA ??
# table(g2cat$GagesII, useNA = "always")
# #80 gages with no StreamCat data (due to the absences noted above in the NHDplus lookup)
# filter(g2cat, is.na(CatAreaSqKm)) %>% select(one_of(c("STANAME","LAT_GAGE","LNG_GAGE","HCDN-2009","STATION_NM","DASqKm","GagesII", "CatAreaSqKm", "WsAreaSqKm","PctAg2006Slp10Cat")))
##drop rows/gages: with no StreamCat data
##drop features: the local "Cat", a few duplicated names, yearly temp and precip
##8987 by 340; now 7047 nonref, 1938 ref, 2 NA; note some NA scattered through individual features
g2cat = filter(g2cat, !is.na(CatAreaSqKm)) %>%
select(-contains("Cat")) %>%
select(-GAGEID) %>%
select(-grep(".y$", names(g2cat))) %>%
select(-one_of(paste0("TMP",1950:2009,"_AVG"))) %>%
select(-one_of(paste0("PPT",1950:2009,"_AVG"))) %>%
mutate(PPTAVG_BASIN = 10*PPTAVG_BASIN, STOR_NOR_2009 = 1000*STOR_NOR_2009, STOR_NID_2009 = 1000*STOR_NID_2009)
#note that Sc and NWIS do not agree on the state for 83 obs
filter(g2cat, STATE.x != stabb)
#### Join preliminary streamflow numbers
#### PENDING REVISION WITH RERUN stateGages/stateMetrics() WRAPPER FUNCTIONS in catFlowOrig/catFlowWork.R
#### HERE JUST USING PREVIOUSLY BUILD OBJECTS FROM ORIGINAL stateMetrics() RUN
# # ## 'cfd' is cATfLOWdATA
# # cfd = left_join(g2cat
# # ,inner_join(do.call(rbind, readRDS("/Users/Dan/Google Drive/NHDplus/catFlowOrig/gagesCONUS.rds"))
# # ,do.call(rbind, readRDS("/Users/Dan/Google Drive/NHDplus/catFlowOrig/gagesCONUSmetrics.rds"))
# # , by="site_no") %>% rename(STAID = site_no)
# # ,by="STAID"
# # )
# #8987 by 378, including several redundant metadata features...
# saveRDS(cfd, "NWIS_G2_Sc_prelimFlows.rds")
pCodeToName[grep("Discharge", pCodeToName$description), 1:2]
## wrapper helpter to return tbl_df of gages/rows by site features
## single state allows easy aggregration back up to regions or other groupings
## only one state allowed at a time by webservice for whatNWISsites
## and limit on number of sites in one call to whatNWISdata (hence somewhat kludgy factor index in sites.data construction)
## played with metric calculation here (returning as additional cols)
## but keeping as separate function for modularity (and potential mclapply)
## setting statCd ensures only daily mean, which reduces duplicate rows from min/max/instant (http://help.waterdata.usgs.gov/stat_code)
stateGages = function(stabb
,endafter = as.Date("1985-01-01")
,nobs = 365*5
){
sites.all = whatNWISsites(stateCd=stabb, parameterCd = "00060")
sites.data = do.call(rbind,
lapply(split(x = sites.all$site_no, f = factor(rep(paste0(letters,rep(1:50,each=26)), each=50, length.out=nrow(sites.all))))
, function(sitevec) whatNWISdata(sitevec, service = "dv", parameterCd = "00060", statCd = "00003")
))
sites.data = tbl_df(filter(sites.data, end_date > endafter & count_nu > nobs))
return(sites.data)
} #end stateGages
# #str(stateGages("RI"))
# system.time(
# gages.st <- setNames(lapply(state.abb[-grep("AK|HI", state.abb)], function(st) stateGages(st))
# , state.abb[-grep("AK|HI", state.abb)])
# )
# # #after stat_cd enforcement, still a few non-unique site_no due to idiosyncrasies...
# # sapply(gages.st, nrow)
# # sapply(gages.st, function(s) nrow(distinct(s, site_no)))
# # lapply(gages.st, function(s) filter(s, duplicated(site_no))$site_no)
# # t(filter(gages.st[["OK"]], site_no == "07334200"))
# # #distinct() just chooses first, not largest obs count as desired: (distinct(gages.st[["OK"]], site_no) %>% filter(site_no == "07334200"))$count_nu
# ##So, can clean up by selecting the record with the largest count...
# ##but this may still blow up in the metrics?
# gages.st = lapply(gages.st, function(s) {
# if(any(duplicated(s$site_no))) {
# ds = s$site_no[duplicated(s$site_no)]
# ## make a copy of the not-duplicated gages
# ## this DOES catch the first occurrence of a duplicated site_no
# ## since ds is character not logical...
# s2 = filter(s, !(site_no %in% ds))
# for(i in unique(ds)) {
# s2 = rbind(s2, filter(s, site_no %in% i) %>% slice(which.max(count_nu)))
# }
# return(s2)
# } else {return(s)}
# })
# saveRDS(gages.st, "gagesCONUS.rds")
## follow up wrapper to return tbl_df of gages/rows by gage flow metrics; preallocated for certainty
## for one or a few stats, doesn't seem to make sense to hold the full record
## keeps it lighter and more reproducible for now
## lots of other useful info returned in readNWISdv call, including the lat/lon
## but just holding number of observations, number of missings obs, number of no flow days and a few quantiles
## began with inner_join() to other metadata before return, but seems to be potential for some duplicated rows/gages
## so returning just metrics to allow subsequent filter and join/merge
stateMetrics = function(stg #tbl_df of gages, as returned by stateGages
,endafter = "1985-01-01"
,prb = c(0.001, 0.01,0.1,0.25,0.5,0.95,.99)
){
fmet = matrix(numeric(), nrow = nrow(stg), ncol = 3+length(prb)
,dimnames = list(as.character(stg$site_no), c("nobs","nna","noflow", paste0("pcntl",prb)))
)
for(s in stg$site_no){
q = 0.0283168 * readNWISdv(s, parameterCd = "00060", startDate = endafter)$X_00060_00003
fmet[s,] = c(length(q), sum(is.na(q)), sum(q <= 0, na.rm = T)
,quantile(q, probs = prb, na.rm = T)
)
}
fmet = tbl_df(data.frame(site_no=stg$site_no, data.matrix(fmet), stringsAsFactors = F))
return(fmet)
} #end stateMetrics
#tester: gages.st.met <- setNames(lapply(gages.st[c("DE","RI")], stateMetrics), c("DE","RI"))
# #again, preallocated due to memory issues
# #loop fails periodically due to Windows not reclaiming memory
# #testing gc() shows no effect...so just babysitting
# setwd("catFlow")
# gages.st = readRDS("gagesCONUS.rds")
# #gages.st.met = setNames(lapply(names(gages.st), function(x) NULL), names(gages.st))
# gages.st.met = readRDS("gagesCONUSmetrics.rds")
# sapply(gages.st.met, dim)
# for(s in names(gages.st.met)[sapply(gages.st.met, is.null)]){
# print(s)
# print(nrow(gages.st[[s]]))
# gages.st.met[[s]] = stateMetrics(gages.st[[s]])
# saveRDS(gages.st.met, "gagesCONUSmetrics.rds")
# }
cfd = readRDS(gzcon(url("https://github.com/daauerbach/catFlow/raw/master/NWIS_G2_Sc_prelimFlows.rds")))
#Watershed focal [Sc] features
ffws = readRDS(gzcon(url("https://github.com/daauerbach/streamcatUtils/raw/master/focalfeatWsList.rds")))
##comparison features
#grep("NLCD06",names(g2cat), value = T); grep("2006Ws$",names(g2cat), value = T)
cf = list(
ids = c("LNG_GAGE","LAT_GAGE", "dec_lon", "dec_lat", "LonSite","LatSite"
,"COMID","STATION_NM","STATE_CD","Active","GagesII","HCDN-2009","HYDRO_DISTURB_INDX","BASIN_BOUNDARY_CONFIDENCE","SCREENING_COMMENTS")
,das = c("PCT_DIFF_NWIS","NWIS_DRAIN_SQKM","DRAIN_SQKM.x","DASqKm","WsAreaSqKm")
,hydc_g = c("PPTAVG_BASIN","T_MIN_BASIN","T_AVG_BASIN","T_MAX_BASIN", "RUNAVE7100","BFI_AVE")
,hydc_sc = c("PrecipWs","TminWs","TmeanWs","TmaxWs", "RunoffWs","BFIWs")
,soil_g = c("CLAYAVE","SANDAVE","OMAVE","ROCKDEPAVE","WTDEPAVE","PERMAVE","KFACT_UP")
,soil_sc = c("ClayWs","SandWs","OmWs","RckDepWs","WtDepWs","PermWs","KffactWs")
,nlcd06_g = c("IMPNLCD06","WATERNLCD06", "DEVLOWNLCD06","DEVMEDNLCD06","DEVHINLCD06", "DECIDNLCD06","EVERGRNLCD06","MIXEDFORNLCD06"
,"SHRUBNLCD06","GRASSNLCD06","PASTURENLCD06","CROPSNLCD06", "WOODYWETNLCD06","EMERGWETNLCD06")
,nlcd06_sc = c("PctImp2006Ws","PctOw2006Ws", "PctUrbLo2006Ws","PctUrbMd2006Ws","PctUrbHi2006Ws", "PctDecid2006Ws","PctConif2006Ws","PctMxFst2006Ws"
,"PctShrb2006Ws","PctGrs2006Ws","PctHay2006Ws","PctCrop2006Ws", "PctWdWet2006Ws","PctHbWet2006Ws")
,devel_g = c("STOR_NOR_2009","STOR_NID_2009","DDENS_2009", "CANALS_PCT","MINING92_PCT","NPDES_MAJ_DENS","ROADS_KM_SQ_KM","RD_STR_INTERS")
,devel_sc = c("DamNrmStorWs", "DamNIDStorWs", "DamDensWs", "CanalDensWs","MineDensWs","NPDESDensWs","RdDensWs","RdCrsWs")
,pop_g = c("PDEN_2000_BLOCK","PDEN_DAY_LANDSCAN_2007","PDEN_NIGHT_LANDSCAN_2007")
,pop_sc = "PopDen2010Ws"
)