Water EDA report 2024_05_09.Rmd

---
title: "EDA report"
author: "Dan Myers"
date: "`r Sys.Date()`"
output: 
  html_document:
    toc: TRUE
    toc_float: TRUE
    number_sections: FALSE
    theme: darkly
---

This formal, organized report includes data summary statistics, results from exploratory data analysis, and visualizations of all NCRN water quality data. Preparation of this report will inform identification of missing, incomplete, or erroneous data and subsequent updates to the data workflow processes and to the NCRN Water Monitoring Protocol.

QC scripts can be found at <https://github.com/NCRN/WaterQC/tree/main>

### Load packages

```{r, message=F, warning=F}
library(dplyr)
library(tidyr)
library(lubridate)
library(sf)
library(mblm)
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(DT)
```

### Read and format data

Make sure the EDD data (e.g., 20240129_wqp_wqx_bss_wq_npsncrn.csv) and the watershed conditions data (Watershed conditions 2024_03_27.csv) are both in the working directory.

```{r, message=F, warning=F}
### Water quality data #########################################################
# Load the EDD export; the base name is kept separate from its extension
fileName <- "20240129_wqp_wqx_bss_wq_npsncrn" # Leave out .csv extension
wdata <- read.csv(paste0(fileName, ".csv"))

# Coerce the sampling date to Date and the result value to numeric
wdata$ActivityStartDate <- as.Date(wdata$ActivityStartDate)
wdata$ResultMeasureValue <- as.numeric(wdata$ResultMeasureValue)

# Keep only perennial-stream project rows that have a numeric result and whose
# characteristic is not in the exclusion list below. which() discards rows
# where the combined condition is NA.
excluded_chars <- c(
  "Chlorine",
  "Weather Condition (WMO Code 4501) (choice list)",
  "RBP2, Low G, Riparian Vegetative Zone Width, Left Bank (choice list)",
  "RBP2, Low G, Riparian Vegetative Zone Width, Right Bank (choice list)")
keep_row <- wdata$ProjectIdentifier == "USNPS NCRN Perennial stream water monitoring" &
  !is.na(wdata$ResultMeasureValue) &
  !(wdata$CharacteristicName %in% excluded_chars)
wdata <- wdata[which(keep_row), ]

# Collapse multiple measurements across the stream into one median per
# site / date / characteristic. The result stays grouped by site and date.
wdata_avgd <- wdata %>%
  group_by(MonitoringLocationIdentifier, ActivityStartDate, CharacteristicName) %>%
  summarise(ResultMeasureValue = median(ResultMeasureValue, na.rm = TRUE),
            .groups = "drop_last")

# First and last sampling date of every site
current_sites <- wdata_avgd %>%
  group_by(MonitoringLocationIdentifier) %>%
  summarise(start_date = min(ActivityStartDate, na.rm = TRUE),
            end_date = max(ActivityStartDate, na.rm = TRUE))

# A site counts as currently monitored when its last sample falls in 2023
sampled_in_2023 <- year(current_sites$end_date) == 2023
current_sites <- current_sites[sampled_in_2023, ]

# Remove this site with too little data
not_spru <- current_sites$MonitoringLocationIdentifier != "NCRN_GWMP_SPRU"
current_sites <- current_sites[not_spru, ]

# Restrict the measurements to the retained sites
at_current_site <- wdata_avgd$MonitoringLocationIdentifier %in%
  current_sites$MonitoringLocationIdentifier
wdata_avgd <- wdata_avgd[at_current_site, ]

# Choose characteristic
CharacteristicName <- "Specific conductance"


### If selecting only specific months of WQ data ###############################
# Keep an untouched copy so the month/year selection below can be changed and re-run
wdata_avgd_all <- wdata_avgd

# Defaults: every year spanned by the data and all twelve months
sample_years <- year(wdata_avgd_all$ActivityStartDate)
desired_years <- min(sample_years, na.rm = TRUE):max(sample_years, na.rm = TRUE)
desired_months <- 1:12

# Select only certain months (comment or uncomment these, otherwise it will use all the months)
# desired_months <- c(5,6,7,8,9,10) # Warm season
# desired_months <- c(11,12,1,2,3,4) # Cold season
# desired_months <- c(1,4,7,10) ; desired_years <- c(2005:2018) # Quarterly through 2018

# Keep the rows whose sampling date falls in both the selected months and years
in_desired_months <- month(wdata_avgd_all$ActivityStartDate) %in% desired_months
in_desired_years <- sample_years %in% desired_years
wdata_avgd <- wdata_avgd_all[in_desired_months & in_desired_years, ]


### Watershed LULC, road salt, and background SC data ##########################
# Load the watershed conditions table, key it by the same site identifier as
# the water quality data, and keep only the currently monitored sites
shed_sums <- read.csv("Watershed conditions 2024_03_27.csv") %>%
  rename(MonitoringLocationIdentifier = NEW_IMLOCI)
shed_sums <- shed_sums[shed_sums$MonitoringLocationIdentifier %in%
                         current_sites$MonitoringLocationIdentifier, ]

# Land-cover class suffixes shared by the "Change.", "NLCD06." and "NLCD19."
# columns. Spelled once here so every calculation below uses the same columns.
dev_classes <- c("Developed..open..21.",
                 "Developed..low..22.",
                 "Developed..medium..23.",
                 "Developed..high..24.")
ag_classes <- c("Pasture..81.", "Crops..82.")
forest_classes <- c("Deciduous.forest..41.",
                    "Evergreen.forest..42.",
                    "Mixed.forest..43.")
dev_change_cols <- paste0("Change.", dev_classes)

# Element-wise mean of the NLCD06 and NLCD19 columns for the given classes.
# Returns a data frame with one column per class, in the order of `classes`.
nlcd_mean <- function(sheds, classes) {
  (sheds[, paste0("NLCD06.", classes), drop = FALSE] +
     sheds[, paste0("NLCD19.", classes), drop = FALSE]) / 2
}

# Calculate total urban change (expansion): sum over all developed classes
urb_df <- data.frame(MonitoringLocationIdentifier = shed_sums$MonitoringLocationIdentifier,
                     urb_chg = rowSums(shed_sums[, dev_change_cols]))

# Calculate urban intensification (infill): keep only the positive changes per
# developed class; zero, negative, or missing changes become NA and are then
# ignored by the row sum (a site with no positive change gets 0)
infill <- as.matrix(shed_sums[, dev_change_cols])
infill[is.na(infill) | infill <= 0] <- NA
inf <- data.frame(MonitoringLocationIdentifier = shed_sums$MonitoringLocationIdentifier,
                  open = infill[, 1],
                  low = infill[, 2],
                  med = infill[, 3],
                  high = infill[, 4],
                  row.names = NULL)
urb_df$urb_inf <- rowSums(inf[, c("open", "low", "med", "high")], na.rm = TRUE)

# Calculate combined urbanization rate (expansion + infill)
urb_df$urb_com <- urb_df$urb_chg + urb_df$urb_inf

# Calculate total urban, agricultural, and forest area (average of years)
dev_mean <- nlcd_mean(shed_sums, dev_classes)
urb_df$urb_tot <- rowSums(dev_mean)
urb_df$ag_tot <- rowSums(nlcd_mean(shed_sums, ag_classes))
urb_df$for_tot <- rowSums(nlcd_mean(shed_sums, forest_classes))

# Add developed sub-types (columns of dev_mean follow the order of dev_classes).
# Previously `low` averaged the NLCD19 column with itself; it now uses
# NLCD06 and NLCD19 like the other three sub-types.
urb_df$open <- dev_mean[[1]]
urb_df$low <- dev_mean[[2]]
urb_df$med <- dev_mean[[3]]
urb_df$high <- dev_mean[[4]]

# Format data frame and remove extra sites
urb_df <- urb_df[urb_df$MonitoringLocationIdentifier %in%
                   current_sites$MonitoringLocationIdentifier, ]


### Background SC and watershed conditions data ################################
# Calculate per-site median, quartiles, and 90th percentile of the chosen
# characteristic in one pass. Each column of site_stats is one site, in the row
# order of current_sites; a site with no matching measurements gets NA.
is_chosen_char <- wdata_avgd$CharacteristicName == CharacteristicName
site_stats <- vapply(
  current_sites$MonitoringLocationIdentifier,
  function(site_id) {
    sel_data <- wdata_avgd$ResultMeasureValue[
      wdata_avgd$MonitoringLocationIdentifier == site_id & is_chosen_char]
    c(median(sel_data, na.rm = TRUE),
      quantile(sel_data, c(0.25, 0.75, 0.90), na.rm = TRUE, names = FALSE))
  },
  numeric(4))
current_sites$medians_raw <- unname(site_stats[1, ])
current_sites$q1_raw <- unname(site_stats[2, ])
current_sites$q3_raw <- unname(site_stats[3, ])
current_sites$q90th_raw <- unname(site_stats[4, ])

# Add background SC and watershed protected area
current_sites <- left_join(current_sites, shed_sums, by = "MonitoringLocationIdentifier")
current_sites <- left_join(current_sites, urb_df, by = "MonitoringLocationIdentifier")

# Correct for background SC: AvgPredEC is subtracted only when the chosen
# characteristic is specific conductance; otherwise the raw statistics are kept
if (CharacteristicName == "Specific conductance") {
  background <- current_sites$AvgPredEC
} else {
  background <- 0
}
for (stat in c("medians", "q1", "q3", "q90th")) {
  current_sites[[stat]] <- current_sites[[paste0(stat, "_raw")]] - background
}

# Shorten outputs
outputs <- data.frame(MonitoringLocationIdentifier = current_sites$MonitoringLocationIdentifier,
                      medians = current_sites$medians,
                      q1 = current_sites$q1,
                      q3 = current_sites$q3,
                      q90th = current_sites$q90th,
                      pct_prot = current_sites$Prot_area_pct..PAD., # Changed from NPS ownership to PAD
                      pct_urb = current_sites$urb_tot * 100,
                      pct_ag = current_sites$ag_tot * 100,
                      pct_for = current_sites$for_tot * 100)


### Road salt data #############################################################
# Calculate 2005-2019 avg salt.
# The site identifier is taken from shed_sums, the same table the salt and area
# values come from, so every row is labelled with its own site regardless of
# how current_sites happens to be ordered.
salt_year_cols <- paste0("X", 2005:2019)
avg_salt <- data.frame(
  MonitoringLocationIdentifier = substr(shed_sums$MonitoringLocationIdentifier, 1, 14),
  avg_lbs = rowMeans(shed_sums[, salt_year_cols]),
  area_km = shed_sums$Area_km)
avg_salt$avg_kg <- avg_salt$avg_lbs * 0.453592 # convert lbs to kg
# NOTE(review): the salt plot later labels this quantity "tons/km2/yr", but
# nothing here divides by area_km - confirm the units of the X2005..X2019 columns.

# Save for later
current_sites_save <- current_sites
```

### Time series plots

```{r, message=F, warning=F, fig.width=6.5, fig.height=7.5}

# Start list of plots
plotlist <- list()

# Sites to analyse: every current site except NCRN_GWMP_SPRU
sites_unique <- unique(current_sites$MonitoringLocationIdentifier)
sites_unique <- sites_unique[sites_unique != "NCRN_GWMP_SPRU"]

# One row of trend statistics per site, in the order of sites_unique
slopes_df <- data.frame(MonitoringLocationIdentifier = sites_unique,
                        slopes_avg = NA, p_value = NA, MAD = NA,
                        lci = NA, uci = NA, ci = NA)

# Fit a trend for every site. Looping over the sites themselves (rather than
# over hard-coded index ranges) keeps this correct if the number of sites changes.
for (site_idx in seq_along(sites_unique)) {

  # Choose site
  site <- sites_unique[site_idx]

  # Query time series
  site_data <- wdata_avgd[wdata_avgd$MonitoringLocationIdentifier == site &
                            wdata_avgd$CharacteristicName == CharacteristicName, ]

  # Set up trend data
  x <- site_data$ActivityStartDate
  y <- site_data$ResultMeasureValue

  # Drop missing dates/values and values below 1. The mask is computed once,
  # before either vector is changed, so x and y always stay aligned.
  keep <- !is.na(x) & !is.na(y) & y >= 1
  x <- decimal_date(x[keep]) # Convert to decimal date
  y <- y[keep]

  # A line cannot be fitted to fewer than two points; leave the row as NA
  if (length(y) < 2) {
    warning("Skipping trend for ", site, ": fewer than 2 usable observations")
    next
  }

  # Create trend model for whole time period
  # (x and y keep these exact names because the formula refers to them)
  g <- mblm(y ~ x)

  # Slope estimate, its MAD, and the p-value from the model summary
  gs <- summary(g)
  slopes_df$slopes_avg[site_idx] <- gs$coefficients[2, 1]
  slopes_df$MAD[site_idx] <- gs$coefficients[2, 2]
  slopes_df$p_value[site_idx] <- gs$coefficients[2, 4]

  # Confidence interval of the slope and its width
  slope_ci <- confint(g)
  slopes_df$lci[site_idx] <- slope_ci[2, 1]
  slopes_df$uci[site_idx] <- slope_ci[2, 2]
  slopes_df$ci[site_idx] <- slope_ci[2, 2] - slope_ci[2, 1]
}

### Plot with ggplotly #############################################################

# Smoothing method for geom_smooth() that fits with mblm::mblm()
# (https://stackoverflow.com/questions/48349858/how-can-i-use-theil-sen-method-with-geom-smooth).
# `weights` is captured by name and never forwarded; every other argument is
# passed to mblm() unchanged.
sen <- function(..., weights = NULL) mblm::mblm(...)

# Sites to plot: every current site except NCRN_GWMP_SPRU, split into figures
# of at most 15 facets each (computed from the data instead of fixed index
# ranges, so no site is dropped if the number of sites changes)
sites_unique <- unique(current_sites$MonitoringLocationIdentifier)
sites_unique <- sites_unique[sites_unique != "NCRN_GWMP_SPRU"]
sites_per_figure <- 15
site_groups <- split(sites_unique,
                     ceiling(seq_along(sites_unique) / sites_per_figure))

# Start loop through figures
for (plot_num in seq_along(site_groups)) {

  # Sites shown in this figure
  ggsites <- site_groups[[plot_num]]

  # Query time series
  site_data <- wdata_avgd[wdata_avgd$MonitoringLocationIdentifier %in% ggsites &
                            wdata_avgd$CharacteristicName == CharacteristicName, ]

  # Format time series
  time_series <- data.frame(
    Date = site_data$ActivityStartDate,
    Value = site_data$ResultMeasureValue,
    Site = site_data$MonitoringLocationIdentifier) %>%
    arrange(Date)

  # Remove NAs
  time_series <- time_series[!is.na(time_series$Date) & !is.na(time_series$Value), ]

  # Remove values below 1 (this covers zero and negative values)
  time_series <- time_series[time_series$Value >= 1, ]

  # Make ggplot: grey series with a gold mblm trend line, one facet per site
  a <- ggplot(time_series, aes(x = Date, y = Value)) +
    geom_line(color = "darkgrey") +
    labs(y = "SC (µS/cm)") +
    geom_smooth(method = sen, color = "gold") +
    facet_wrap(vars(Site), scales = "free")

  # Save outputs
  plotlist[[plot_num]] <- ggplotly(a)

}

# Print ggplotly
htmltools::tagList(setNames(plotlist, NULL))
```

### Watershed protection and trend analysis plots

```{r, message=F, warning=F, fig.width=5.5, fig.height=6.5}
# Assign plotting specs
ylabs <- "SC (µS/cm)"
ylim <- 1600
threshold <- 171

### Boxplot ##################################################################
# Spread the chosen characteristic into one column per site
is_chosen_char <- wdata_avgd$CharacteristicName == CharacteristicName
pw <- wdata_avgd[is_chosen_char, ] %>%
  pivot_wider(names_from = "MonitoringLocationIdentifier",
              values_from = "ResultMeasureValue")

# Copy the site columns into a plain data frame. The first two columns of pw
# (sampling date and characteristic name) are left out, and each site column is
# named by characters 6-14 of its identifier.
site_cols <- lapply(pw, unlist)[3:ncol(pw)]
pw_df <- as.data.frame(site_cols)
colnames(pw_df) <- substr(names(site_cols), 6, 14)

# Sort sites by median (ascending; ties keep their original column order)
medians <- apply(pw_df, 2, median, na.rm = TRUE)
toPlot <- pw_df[, order(medians)]


### Trend analysis plots #######################################################
### Join the outputs from the above calculations
# Start from the saved site table and attach the per-site trend statistics
current_sites <- current_sites_save %>%
  left_join(slopes_df, by = "MonitoringLocationIdentifier")

# Shorter aliases for some fields
current_sites$NPS_pct <- current_sites$NPS.Watershed.Ownership.Percent
current_sites$sen_slope <- current_sites$slopes_avg
current_sites$mk_p <- current_sites$p_value

# Attach the salt data and express the average in tons (kg / 1000)
current_sites <- current_sites %>%
  left_join(avg_salt, by = "MonitoringLocationIdentifier")
current_sites$avg_ton <- current_sites$avg_kg / 1000

# Turn-off scientific notation
options(scipen = 999)

# Urban share as a percentage (just displayed as percent)
current_sites$urb_tot2 <- 100 * current_sites$urb_tot

### Make ggplotly scatterplots #####################################################
# Shared builder for the three "median vs. watershed predictor" plots: points
# at the site median, error bars from q1 to q3, and a blue lm fit. The site
# identifier is the only hover text. `x_aes` supplies the x mapping.
median_scatter <- function(x_aes, x_label) {
  p <- ggplot(current_sites, x_aes) +
    aes(y = medians, text = MonitoringLocationIdentifier) +
    geom_point() +
    geom_errorbar(aes(ymin = q1, ymax = q3)) +
    labs(y = "SC over background (µS/cm)", x = x_label) +
    geom_smooth(method = lm, color = "blue")
  ggplotly(p, tooltip = "text")
}

# Make urban ggplot
median_scatter(aes(x = (urb_tot*100)), "Urban (%)")

# Make forest ggplot
median_scatter(aes(x = (for_tot*100)), "Forest (%)")

# Make protected area ggplot
median_scatter(aes(x = Prot_area_pct..PAD.), "Protected area (%)")


# Make urban trends ggplot: slope vs. total urban area, point size = urb_com
urban_trend <- ggplot(current_sites,
                      aes(x = (urb_tot*100), y = sen_slope,
                          text = MonitoringLocationIdentifier)) +
  geom_point(aes(size = urb_com)) +
  geom_smooth(method = lm, color = "blue") +
  labs(y = "ΔSC (µS/cm per year)",
       x = "Total urban area (% watershed)\nSize = urban growth rate 0-10%",
       subtitle = "Size = urban growth rate 0-10%")
ggplotly(urban_trend, tooltip = "text")


# Make salt trends ggplot: slope vs. average salt applications
salt_trend <- ggplot(current_sites,
                     aes(x = avg_ton, y = sen_slope,
                         text = MonitoringLocationIdentifier)) +
  geom_point() +
  geom_smooth(method = lm, color = "blue") +
  labs(y = "ΔSC (µS/cm per year)", x = "Salt applications (tons/km2/yr)")
ggplotly(salt_trend, tooltip = "text")
```

### Output watershed conditions table

```{r, message=F, warning=F}
# Create watershed conditions data frame: one row per site, land-cover shares
# as whole percentages, watershed area to one decimal, sorted by site name
site_label <- substr(current_sites$MonitoringLocationIdentifier, 6, 14)
wat <- arrange(
  data.frame(
    Site_name = site_label,
    Trees = round(100 * current_sites$for_tot),
    Crops = round(100 * current_sites$ag_tot),
    Urban = round(100 * current_sites$urb_tot),
    Protect. = round(current_sites$Prot_area_pct..PAD.),
    Area = round(current_sites$Area_km, 1)),
  Site_name)

# Print outputs
# print(wat)
wat_widget <- datatable(wat, style = "bootstrap", class = "display nowrap")
formatStyle(wat_widget, names(wat), `font-size` = '12px')
```

### Count number of measurements per site

```{r, message=F, warning=F}
# Extract data (SC): rows of the chosen characteristic at the retained sites
at_current_site <- wdata_avgd$MonitoringLocationIdentifier %in%
  current_sites$MonitoringLocationIdentifier
is_chosen_char <- wdata_avgd$CharacteristicName == CharacteristicName
data1 <- wdata_avgd[at_current_site & is_chosen_char, ]

# Summarize data (SC): per-site count of values above -999 (NAs are not counted)
data1_sum <- summarise(
  group_by(data1, MonitoringLocationIdentifier),
  n = sum(ResultMeasureValue > -999, na.rm = TRUE))

# Calculate mean and SD (SC) of the per-site counts
mean(data1_sum$n)
sd(data1_sum$n)
```

### Plot with urban growth rate on x axis

```{r, message=F, warning=F, fig.width=4, fig.height=4}
# Turn-off scientific notation
options(scipen = 999)

# Make urban trends ggplot: slope vs. combined urbanization rate, with open
# circles sized by total urban area; site identifier as hover text
growth_trend <- ggplot(current_sites,
                       aes(x = (urb_com*100), y = sen_slope,
                           text = MonitoringLocationIdentifier)) +
  geom_point(aes(size = (urb_tot*100)), shape = 1) +
  geom_smooth(method = lm, color = "blue") +
  labs(y = "ΔSC (µS/cm per year)",
       x = "Urban growth rate\nSize = total urban area (0-100%)")
ggplotly(growth_trend, tooltip = "text")
```

### Plots with imperviousness, road density, and other predictor options

```{r, message=F, warning=F, fig.width=6.5, fig.height=6.5}

# Create extra watershed conditions data frame: imperviousness, developed
# sub-types as whole percentages, and road density to one decimal, sorted by
# site name
site_label <- substr(current_sites$MonitoringLocationIdentifier, 6, 14)
wat2 <- arrange(
  data.frame(
    Site_name = site_label,
    Impervious = round(current_sites$Watershed_Percent_Impervious_NLCD2019),
    Open_space = round(100 * current_sites$open),
    Low_intensity = round(100 * current_sites$low),
    Med_intensity = round(100 * current_sites$med),
    High_intensity = round(100 * current_sites$high),
    Road_density = round(current_sites$roads_km_km2, 1)),
  Site_name)

# Print outputs
wat2_widget <- datatable(wat2, style = "bootstrap", class = "display nowrap")
formatStyle(wat2_widget, names(wat2), `font-size` = '12px')
# Write to csv
# write.csv(wat2, "Watershed conditions table 2.csv", row.names=F)
```

### Make time series of all sites and characteristics

```{r, message=F, warning=F, fig.width=6.5, fig.height=7.5}

# Start list of plots. Every figure of every characteristic is appended, so
# nothing is overwritten when the loop moves on to the next characteristic.
plotlist <- list()

# Select desired characteristics
all_chars <- c("Acid Neutralizing Capacity (ANC)",
               "Dissolved oxygen (DO)",
               "Temperature, water",
               "pH",
               "Total Nitrogen, mixed forms",
               "Total Phosphorus, mixed forms")

# Nutrient measurements before this date are excluded
nutrient_chars <- c("Total Nitrogen, mixed forms", "Total Phosphorus, mixed forms")
nutrient_start <- as.Date("2016-09-26")

# Sites to analyse: every current site except NCRN_GWMP_SPRU, split into
# figures of at most 15 facets each (computed from the data instead of fixed
# index ranges, so no site is dropped if the number of sites changes)
sites_unique <- unique(current_sites$MonitoringLocationIdentifier)
sites_unique <- sites_unique[sites_unique != "NCRN_GWMP_SPRU"]
sites_per_figure <- 15
site_groups <- split(sites_unique,
                     ceiling(seq_along(sites_unique) / sites_per_figure))

# Loop through multiple characteristics
for (CharacteristicName2 in all_chars) {

  # Trend statistics, one row per site in the order of sites_unique. This is
  # rebuilt for every characteristic, so after the loop it holds the values of
  # the last one.
  slopes_df <- data.frame(MonitoringLocationIdentifier = sites_unique,
                          slopes_avg = NA, p_value = NA, MAD = NA,
                          lci = NA, uci = NA, ci = NA)

  # Start loop through sites
  for (site_idx in seq_along(sites_unique)) {

    # Choose site
    site <- sites_unique[site_idx]

    # Query time series
    site_data <- wdata_avgd[wdata_avgd$MonitoringLocationIdentifier == site &
                              wdata_avgd$CharacteristicName == CharacteristicName2, ]

    # Remove early nutrient measurements
    if (CharacteristicName2 %in% nutrient_chars) {
      site_data <- site_data[site_data$ActivityStartDate >= nutrient_start, ]
    }

    # Set up trend data
    x <- site_data$ActivityStartDate
    y <- site_data$ResultMeasureValue

    # Remove NAs. The mask is computed once, before either vector is changed,
    # so x and y always stay aligned.
    keep <- !is.na(x) & !is.na(y)
    x <- decimal_date(x[keep]) # Convert to decimal date
    y <- y[keep]

    # A line cannot be fitted to fewer than two points; leave the row as NA
    if (length(y) < 2) {
      warning("Skipping ", CharacteristicName2, " trend for ", site,
              ": fewer than 2 usable observations")
      next
    }

    # Create trend model for whole time period
    # (x and y keep these exact names because the formula refers to them)
    g <- mblm(y ~ x)

    # Slope estimate, its MAD, and the p-value from the model summary
    gs <- summary(g)
    slopes_df$slopes_avg[site_idx] <- gs$coefficients[2, 1]
    slopes_df$MAD[site_idx] <- gs$coefficients[2, 2]
    slopes_df$p_value[site_idx] <- gs$coefficients[2, 4]

    # Confidence interval of the slope and its width
    slope_ci <- confint(g)
    slopes_df$lci[site_idx] <- slope_ci[2, 1]
    slopes_df$uci[site_idx] <- slope_ci[2, 2]
    slopes_df$ci[site_idx] <- slope_ci[2, 2] - slope_ci[2, 1]
  }

  ### Plot with ggplotly ###########################################################

  # Start loop through figures
  for (ggsites in site_groups) {

    # Query time series
    site_data <- wdata_avgd[wdata_avgd$MonitoringLocationIdentifier %in% ggsites &
                              wdata_avgd$CharacteristicName == CharacteristicName2, ]

    # Format time series
    time_series <- data.frame(
      Date = site_data$ActivityStartDate,
      Value = site_data$ResultMeasureValue,
      Site = site_data$MonitoringLocationIdentifier) %>%
      arrange(Date)

    # Remove NAs
    time_series <- time_series[!is.na(time_series$Date) & !is.na(time_series$Value), ]

    # Remove extreme water temp errors
    if (CharacteristicName2 == "Temperature, water") {
      time_series <- time_series[time_series$Value < 500, ]
    }

    # Remove early nutrient measurements
    if (CharacteristicName2 %in% nutrient_chars) {
      time_series <- time_series[time_series$Date >= nutrient_start, ]
    }

    # Make ggplot: grey series with a gold mblm trend line, one facet per site
    a <- ggplot(time_series, aes(x = Date, y = Value)) +
      geom_line(color = "darkgrey") +
      labs(y = CharacteristicName2) +
      geom_smooth(method = sen, color = "gold") +
      facet_wrap(vars(Site), scales = "free")

    # Save outputs (append; indexing by figure number alone would let each
    # characteristic overwrite the previous one's figures)
    plotlist[[length(plotlist) + 1]] <- ggplotly(a)

  }
}


# Print ggplotly
htmltools::tagList(setNames(plotlist, NULL))
```

### Make boxplots for all characteristics

```{r, message=F, warning=F}

# Start list of plots
plotlist <- list()
ticker <- 1

# Select desired characteristics
all_chars2 <- c("Acid Neutralizing Capacity (ANC)",
                "Dissolved oxygen (DO)",
                "Temperature, water",
                "pH",
                "Total Nitrogen, mixed forms",
                "Total Phosphorus, mixed forms",
                "Specific conductance")

# Loop through multiple characteristics
for (CharacteristicName3 in all_chars2) {

  ### Boxplot ##################################################################
  # Spread this characteristic into one column per site
  is_this_char <- wdata_avgd$CharacteristicName == CharacteristicName3
  pw <- wdata_avgd[is_this_char, ] %>%
    pivot_wider(names_from = "MonitoringLocationIdentifier",
                values_from = "ResultMeasureValue")

  # Remove early nutrient measurements
  if (CharacteristicName3 %in% c("Total Nitrogen, mixed forms", "Total Phosphorus, mixed forms")) {
    pw <- pw[pw$ActivityStartDate >= as.Date("2016-09-26"), ]
  }

  # Copy the site columns into a plain data frame. The first two columns of pw
  # (sampling date and characteristic name) are left out, and each site column
  # is named by characters 6-14 of its identifier.
  site_cols <- lapply(pw, unlist)[3:ncol(pw)]
  pw_df <- as.data.frame(site_cols)
  colnames(pw_df) <- substr(names(site_cols), 6, 14)

  # Sort sites by median (ascending; ties keep their original column order)
  medians <- apply(pw_df, 2, median, na.rm = TRUE)
  toPlot <- pw_df[, order(medians)]

  ### Plot with ggplotly ###########################################################
  # Pivot back to long form (name = site, value = measurement), dropping gaps
  toPlot2 <- pivot_longer(toPlot, cols = colnames(toPlot))
  toPlot2 <- toPlot2[!is.na(toPlot2$value), ]

  # Make ggplot: one box per site, sites ordered by their median value
  a <- ggplot(toPlot2, aes(reorder(name, value, FUN = median), value)) +
    geom_boxplot() +
    labs(y = CharacteristicName3, x = "") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

  # Save outputs
  plotlist[[ticker]] <- ggplotly(a)
  ticker <- ticker + 1

  # Save widget
  # saveWidget(ggplotly(a), file = paste("Plots/",CharacteristicName3," boxplots.html",sep=""),background='r')
}

# Print ggplotly
htmltools::tagList(setNames(plotlist, NULL))

```