09_occupancy-models.Rmd

---
editor_options: 
  chunk_output_type: console
---

# Modelling Species Occupancy 

### Load necessary libraries

```{r load_libraries, message=FALSE, warning=FALSE}
# Load libraries
# for ebird data
library(auk)
library(ebirdst)

# general data
library(tidyverse)
library(data.table)
library(lubridate)
library(openxlsx)
library(raster) # probably unnecessary

# for models
library(unmarked)
library(MuMIn)
library(AICcmodavg)
library(fields)

# for computation
library(doParallel)
library(snow)
library(ecodist)

# Source necessary functions
source("R/fun_screen_cor.R")
source("R/fun_model_estimate_collection.r")
```

## Load dataframe and prepare covariates

Here, we load the required dataframe that contains 10 random visits to a site and environmental covariates that were prepared at a spatial scale of 2.5 sq.km. We also scaled all covariates (mean around 0 and standard deviation of 1). Next, we ensured that only Traveling and Stationary checklists were considered. Even though stationary counts have no distance traveled, we defaulted all stationary accounts to an effective distance of 100m, which we consider the average maximum detection radius for most bird species in our area.  

```{r load_dataframe}
# Load in the prepared dataframe
dat <- fread("results/08_data-covars-2.5km.csv", header = T)
dat <- as_tibble(dat)
head(dat)
```

### Handle the sampling protocol

Select protocol and add 0.1 km to stationary checklists.

```{r handle_protocol}
# Some more pre-processing to get the right data structures

# Ensuring that only Traveling and Stationary checklists were considered
names(dat)
dat <- dat %>% filter(protocol_type %in% c("Traveling", "Stationary"))

# We take all stationary counts and give them a distance of 100 m (so 0.1 km),
# as that's approximately the max normal hearing distance for people doing point
# counts.
dat <- dat %>%
  mutate(effort_distance_km = if_else(
    effort_distance_km == 0 &
      protocol_type == "Stationary",
    0.1, effort_distance_km
  ))
```

### Handle time and date

Convert time and date to julian date and minutes since day.

```{r handle_time_obs_started}
# Converting time observations started to numeric and adding it as a new column
# This new column will be minute_observations_started
dat <-
  dat %>%
  mutate(
    min_obs_started = as.integer(
      as.difftime(
        time_observations_started,
        format = "%H:%M:%S", units = "mins"
      )
    )
  )

# Adding the julian date to the dataframe
dat <- dat %>%
  mutate(julian_date = lubridate::yday(observation_date))

# recode julian date to model it as a linear predictor
dat <- dat %>%
  mutate(
    newjulianDate =
      case_when(
        (julian_date >= 334 & julian_date) <= 365 ~
        (julian_date - 333),
        (julian_date >= 1 & julian_date) <= 152 ~
        (julian_date + 31)
      )
  ) %>%
  drop_na(newjulianDate)

# recode time observations started to model it as a linear predictor
dat <- dat %>%
  mutate(
    newmin_obs_started = case_when(
      min_obs_started >= 300 & min_obs_started <= 720 ~
      abs(min_obs_started - 720),
      min_obs_started >= 720 & min_obs_started <= 1140 ~
      abs(720 - min_obs_started)
    )
  ) %>%
  drop_na(newmin_obs_started)
```

### Scaling covariates

```{r select_columns_to_scale}
# Removing other unnecessary columns from the dataframe and creating a clean one without the rest
names(dat)
# select relevant columns BY NAME
dat <- dplyr::select(
  dat,
  c(
    "duration_minutes", "effort_distance_km", "locality",
    "locality_type", "locality_id", "observer_id",
    "observation_date", "scientific_name", "observation_count",
    "protocol_type", "number_observers", "pres_abs"
  ),

  # rename X-Y but NOTE THEY ARE IN UTM COORDINATES
  longitude = "X", latitude = "Y",
  expertise,

  # elevation and climate layers
  elev, bio4, bio15,

  # all LANDCOVER COLUMNS
  matches("lc"),

  # set new columns to old column names
  julian_date = "newjulianDate",
  min_obs_started = "newmin_obs_started"
)

# add year and convert presence-absence to integer
dat.1 <- dat %>%
  mutate(
    year = year(observation_date),
    pres_abs = as.integer(pres_abs)
  ) # occupancy modeling requires an integer response

# Scaling detection and occupancy covariates
dat.scaled <- dat.1

# Note: Never refer to columns by numbers, numbers change, names remain
cols_to_scale <- c(
  "duration_minutes", "effort_distance_km",
  "number_observers", "expertise", "elev", "bio4a", "bio15a", "lc_02", "lc_09", "lc_01", "lc_05", "lc_04", "lc_07", "lc_03", "julian_date", "min_obs_started"
)

# this scales the relevant columns between 0 and 1
dat.scaled <- mutate(
  dat.scaled,
  across(
    .cols = all_of(cols_to_scale), # referring to the columns
    .fns = scales::rescale # the rescale function
  )
)

# save data to file
fwrite(dat.scaled, "results/09_scaled-covars-2.5km.csv")
```

### Correct date format

```{r read_scaled_data}
# Reload the scaled covariate data
dat.scaled <- fread("results/09_scaled-covars-2.5km.csv", header = T)
dat.scaled <- as_tibble(dat.scaled)
head(dat.scaled)

# Ensure observation_date column is in the right format
dat.scaled$observation_date <- format(
  as.Date(
    dat.scaled$observation_date,
    "%m/%d/%Y"
  ),
  "%Y-%m-%d"
)
```

### Check for correlated covariates

```{r test_correlation}
# Testing for correlations before running further analyses
# Majority are uncorrelated since we decided to keep climatic and land cover predictors and removed elevation.
source("R/fun_screen_cor.R")

# SELECT COLUMNS to check BY NAME
cols_to_check <- c(
  "expertise", "bio4a", "bio15a", "lc_02",
  "lc_09", "lc_01",
  "lc_05", "lc_04", "lc_07", "lc_03"
)

# screen covariates for correlation
screen.cor(dat.scaled[, cols_to_check], threshold = 0.3)

# total number of presences by species
# min no. presences = 224 to max = 7725
presSpecies <- dat.scaled %>%
  group_by(scientific_name) %>%
  filter(pres_abs == "1") %>%
  summarise(n = n())

# convert locality_id to factors
dat.scaled$locality_id <- as.factor(dat.scaled$locality_id)
```

## Running a null model

```{r null_model}
# All null models are stored in lists below
all_null <- list()

# define species and a counter
species <- unique(dat.scaled$scientific_name)
counter <- 0

# Add a progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(species),
  style = 3
) # text based bar

# loop over species
for (i in species) {

  # filter data by species
  data <- dat.scaled %>%
    filter(scientific_name == i)

  # Preparing data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # 7 years is considered a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id", "lc_01", "lc_02", "lc_05",
      "lc_04", "lc_09", "lc_07", "lc_03", "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object to start fitting occupancy models
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Set up the model
  # the list is now automatically named
  all_null[[i]] <- occu(~1 ~ 1, data = occ_um)

  # increase counter
  counter <- counter + 1

  setTxtProgressBar(pb, counter)
}
close(pb)

# Store all the  model outputs for each species
capture.output(all_null, file = "results/09_null_models.csv")
```

## Identifying covariates necessary to model the detection process

Here, we use the `unmarked` package in R (Fiske and Chandler 2011) to identify detection level covariates that are important for each species. We use AIC criteria to select top models (Burnham et al. 2002; 2011).

```{r prob_detection}
# All models are stored in lists below
det_dred <- list()

# Subsetting those models whose deltaAIC<4 (Burnham et al. 2011)
top_det <- list()

# Getting model averaged coefficients and relative importance scores
det_avg <- list()
det_imp <- list()

# Getting model estimates
det_modelEst <- list()

# Add a progress bar for the loop
pb <- txtProgressBar(
  min = 0,
  max = length(unique(dat.scaled$scientific_name)), style = 3
) # text based bar

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>%
    filter(dat.scaled$scientific_name == unique(dat.scaled$scientific_name)[i])

  # Preparing data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # 7 years is considered a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id", "lc_01", "lc_02", "lc_05",
      "lc_04", "lc_09", "lc_07", "lc_03", "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object to start fitting occupancy models
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  # Fit a global model with all detection level covariates
  global_mod <- occu(~ min_obs_started +
    julian_date +
    duration_minutes +
    effort_distance_km +
    number_observers +
    expertise ~ 1, data = occ_um)

  # Set up the cluster
  clusterType <- if (length(find.package("snow", quiet = TRUE))) "SOCK" else "PSOCK"
  clust <- try(makeCluster(getOption("cl.cores", 5), type = clusterType))

  clusterEvalQ(clust, library(unmarked))
  clusterExport(clust, "occ_um")

  det_dred[[i]] <- pdredge(global_mod, clust)
  names(det_dred)[i] <- unique(dat.scaled$scientific_name)[i]

  # Get the top models, which we'll define as those with deltaAICc < 4
  top_det[[i]] <- get.models(det_dred[[i]], subset = delta < 4, cluster = clust)
  names(top_det)[i] <- unique(dat.scaled$scientific_name)[i]

  # Obtaining model averaged coefficients
  if (length(top_det[[i]]) > 1) {
    a <- model.avg(top_det[[i]], fit = TRUE)
    det_avg[[i]] <- as.data.frame(a$coefficients)
    names(det_avg)[i] <- unique(dat.scaled$scientific_name)[i]


    det_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(a, full = T)[, 1],
      SE = coefTable(a, full = T)[, 2],
      lowerCI = confint(a)[, 1],
      upperCI = confint(a)[, 2],
      z_value = (summary(a)$coefmat.full)[, 3],
      Pr_z = (summary(a)$coefmat.full)[, 4]
    )

    names(det_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]

    det_imp[[i]] <- as.data.frame(MuMIn::importance(a))
    names(det_imp)[i] <- unique(dat.scaled$scientific_name)[i]
  } else {
    det_avg[[i]] <- as.data.frame(unmarked::coef(top_det[[i]][[1]]))
    names(det_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lowDet <- data.frame(lowerCI = confint(top_det[[i]][[1]], type = "det")[, 1])
    upDet <- data.frame(upperCI = confint(top_det[[i]][[1]], type = "det")[, 2])
    zDet <- data.frame(summary(top_det[[i]][[1]])$det[, 3])
    Pr_zDet <- data.frame(summary(top_det[[i]][[1]])$det[, 4])

    Coefficient <- coefTable(top_det[[i]][[1]])[, 1]
    SE <- coefTable(top_det[[i]][[1]])[, 2]

    det_modelEst[[i]] <- data.frame(
      Coefficient = Coefficient[2:length(Coefficient)],
      SE = SE[2:length(SE)],
      lowerCI = lowDet,
      upperCI = upDet,
      z_value = zDet,
      Pr_z = Pr_zDet
    )

    names(det_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]
  }
  setTxtProgressBar(pb, i)
  stopCluster(clust)
}
close(pb)

## Storing output from the above models in excel sheets

# 1. Store all the model outputs for each species (variable: det_dred() - see above)
write.xlsx(det_dred, file = "results/09_det-dred.xlsx")

# 2. Store all the model averaged outputs for each species and the relative importance score
write.xlsx(det_avg, file = "results/09_det-avg.xlsx", rowNames = T, colNames = T)
write.xlsx(det_imp, file = "results/09_det-imp.xlsx", rowNames = T, colNames = T)

write.xlsx(det_modelEst, file = "results/09_det-modelEst.xlsx", rowNames = T, colNames = T)

# Note if you are unable to write to a file, use (for example)
a <- purrr::map(det_imp, ~ purrr::compact(.)) %>% purrr::keep(~ length(.) != 0)
```

## Land Cover and Climate

Occupancy models estimate the probability of occurrence of a given species while controlling for the probability of detection and allow us to model the factors affecting occurrence and detection independently. The flexible eBird observation process contributes to the largest source of variation in the likelihood of detecting a particular species; hence, we included seven covariates that influence the probability of detection for each checklist: ordinal day of year, duration of observation, distance traveled, protocol type, time observations started, number of observers and the checklist calibration index (CCI). 

Using a multi-model information-theoretic approach, we tested how strongly our occurrence data fit our candidate set of environmental covariates (Burnham et al. 2002). We fitted single-species occupancy models for each species, to simultaneously estimate a probability of detection ($\p$) and a probability of occupancy ($\psi$). For each species, we fit models, each with a unique combination of the climate and land cover occupancy covariates and all seven detection covariates. 

Across the models tested for each species, the model with highest support was determined using AICc scores. However, across the majority of the species, no single model had overwhelming support. Hence, for each species, we examined those models which had $\Delta$AICc < 4, as these top models were considered to explain a large proportion of the association between the species-specific probability of occupancy and environmental drivers (Burnham et al. 2011; Elsen et al. 2017). Using these restricted model sets for each species; we created a model-averaged coefficient estimate for each predictor and assessed its direction and significance. We considered a predictor to be significantly associated with occupancy if the range of the 95% confidence interval around the model-averaged coefficient did not contain zero.   
```{r lc_clim}
# All models are stored in lists below
lc_clim <- list()

# Subsetting those models whose deltaAIC<4 (Burnham et al. 2011)
top_lc_clim <- list()

# Getting model averaged coefficients and relative importance scores
lc_clim_avg <- list()
lc_clim_imp <- list()

# Storing Model estimates
lc_clim_modelEst <- list()

# Add a progress bar for the loop
pb <- txtProgressBar(min = 0, max = length(unique(dat.scaled$scientific_name)), style = 3) # text based bar

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>% filter(dat.scaled$scientific_name == unique(dat.scaled$scientific_name)[i])

  # Preparing data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # 7 years is considered a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id", "lc_01", "lc_02", "lc_05",
      "lc_04", "lc_09", "lc_07", "lc_03", "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object to start fitting occupancy models
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  model_lc_clim <- occu(~ min_obs_started +
    julian_date +
    duration_minutes +
    effort_distance_km +
    number_observers +
    expertise ~ lc_01 + lc_02 + lc_05 + lc_04 + lc_09 + lc_07 + lc_03 + 
      bio4a + bio15a, data = occ_um)

  # Set up the cluster
  clusterType <- if (length(find.package("snow", quiet = TRUE))) "SOCK" else "PSOCK"
  clust <- try(makeCluster(getOption("cl.cores", 5), type = clusterType))

  clusterEvalQ(clust, library(unmarked))
  clusterExport(clust, "occ_um")

  # Detection terms are fixed
  det_terms <- c(
    "p(duration_minutes)", "p(effort_distance_km)", "p(expertise)",
    "p(julian_date)", "p(min_obs_started)",
    "p(number_observers)"
  )

  lc_clim[[i]] <- pdredge(model_lc_clim, clust, fixed = det_terms)
  names(lc_clim)[i] <- unique(dat.scaled$scientific_name)[i]

  # Identiying top subset of models based on deltaAIC scores being less than 4 (Burnham et al., 2011)
  top_lc_clim[[i]] <- get.models(lc_clim[[i]], subset = delta < 4, cluster = clust)

  names(top_lc_clim)[i] <- unique(dat.scaled$scientific_name)[i]

  # Obtaining model averaged coefficients for both candidate model subsets
  if (length(top_lc_clim[[i]]) > 1) {
    a <- model.avg(top_lc_clim[[i]], fit = TRUE)
    lc_clim_avg[[i]] <- as.data.frame(a$coefficients)
    names(lc_clim_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lc_clim_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(a, full = T)[, 1],
      SE = coefTable(a, full = T)[, 2],
      lowerCI = confint(a)[, 1],
      upperCI = confint(a)[, 2],
      z_value = (summary(a)$coefmat.full)[, 3],
      Pr_z = (summary(a)$coefmat.full)[, 4]
    )

    names(lc_clim_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]

    lc_clim_imp[[i]] <- as.data.frame(MuMIn::importance(a))
    names(lc_clim_imp)[i] <- unique(dat.scaled$scientific_name)[i]
  } else {
    lc_clim_avg[[i]] <- as.data.frame(unmarked::coef(top_lc_clim[[i]][[1]]))
    names(lc_clim_avg)[i] <- unique(dat.scaled$scientific_name)[i]

    lowSt <- data.frame(lowerCI = confint(top_lc_clim[[i]][[1]], type = "state")[, 1])
    lowDet <- data.frame(lowerCI = confint(top_lc_clim[[i]][[1]], type = "det")[, 1])
    upSt <- data.frame(upperCI = confint(top_lc_clim[[i]][[1]], type = "state")[, 2])
    upDet <- data.frame(upperCI = confint(top_lc_clim[[i]][[1]], type = "det")[, 2])
    zSt <- data.frame(z_value = summary(top_lc_clim[[i]][[1]])$state[, 3])
    zDet <- data.frame(z_value = summary(top_lc_clim[[i]][[1]])$det[, 3])
    Pr_zSt <- data.frame(Pr_z = summary(top_lc_clim[[i]][[1]])$state[, 4])
    Pr_zDet <- data.frame(Pr_z = summary(top_lc_clim[[i]][[1]])$det[, 4])

    lc_clim_modelEst[[i]] <- data.frame(
      Coefficient = coefTable(top_lc_clim[[i]][[1]])[, 1],
      SE = coefTable(top_lc_clim[[i]][[1]])[, 2],
      lowerCI = rbind(lowSt, lowDet),
      upperCI = rbind(upSt, upDet),
      z_value = rbind(zSt, zDet),
      Pr_z = rbind(Pr_zSt, Pr_zDet)
    )

    names(lc_clim_modelEst)[i] <- unique(dat.scaled$scientific_name)[i]
  }
  gc()
  setTxtProgressBar(pb, i)
  stopCluster(clust)
}
close(pb)

# 1. Store all the model outputs for each species (for both landcover and climate)
write.xlsx(lc_clim, file = "results/09_lc-clim.xlsx")

# 2. Store all the model averaged outputs for each species and relative importance scores
write.xlsx(lc_clim_avg, file = "results/09_lc-clim-avg.xlsx", rowNames = T, colNames = T)
write.xlsx(lc_clim_imp, file = "results/09_lc-clim-imp.xlsx", rowNames = T, colNames = T)

# 3. Store all model estimates
write.xlsx(lc_clim_modelEst, file = "results/09_lc-clim-modelEst.xlsx", rowNames = T, colNames = T)

# Note if you are unable to write to a file, use (for example)
a <- purrr::map(lc_clim_modelEst, ~ purrr::compact(.)) %>% purrr::keep(~ length(.) != 0)
```

## Goodness-of-fit tests

Adequate model fit was assessed using a chi-square goodness-of-fit test using 1,000 parametric bootstrap simulations on a global model that included all occupancy and detection covariates (MacKenzie & Bailey, 2004). 
```{r}
goodness_of_fit <- data.frame()

# Add a progress bar for the loop
pb <- txtProgressBar(min = 0, max = length(unique(dat.scaled$scientific_name)), style = 3) # text based bar

for (i in 1:length(unique(dat.scaled$scientific_name))) {
  data <- dat.scaled %>% filter(dat.scaled$scientific_name == unique(dat.scaled$scientific_name)[i])

  # Preparing data for the unmarked model
  occ <- filter_repeat_visits(data,
    min_obs = 1, max_obs = 10,
    annual_closure = FALSE,
    n_days = 1488, # 7 years is considered a period of closure
    date_var = "observation_date",
    site_vars = c("locality_id")
  )

  obs_covs <- c(
    "min_obs_started",
    "duration_minutes",
    "effort_distance_km",
    "number_observers",
    "protocol_type",
    "expertise",
    "julian_date"
  )

  # format for unmarked
  occ_wide <- format_unmarked_occu(occ,
    site_id = "site",
    response = "pres_abs",
    site_covs = c(
      "locality_id", "lc_01", "lc_02", "lc_05",
      "lc_04", "lc_09", "lc_07", "lc_03", "bio4a", "bio15a"
    ),
    obs_covs = obs_covs
  )

  # Convert this dataframe of observations into an unmarked object to start fitting occupancy models
  occ_um <- formatWide(occ_wide, type = "unmarkedFrameOccu")

  model_lc_clim <- occu(~ min_obs_started +
    julian_date +
    duration_minutes +
    effort_distance_km +
    number_observers +
    protocol_type +
    expertise ~ lc_01 + lc_02 + lc_05 +
    lc_04 + lc_09 + lc_07 + lc_03 + bio4a + bio15a, data = occ_um)

  # note: reduce nsim as this takes a very long time even with parallelization
  occ_gof <- mb.gof.test(model_lc_clim,
    nsim = 1000, parallel = T, ncores = 5,
    plot.hist = FALSE
  )

  p.value <- occ_gof$p.value
  c.hat <- occ_gof$c.hat.est
  scientific_name <- unique(data$scientific_name)

  a <- data.frame(scientific_name, p.value, c.hat)

  goodness_of_fit <- rbind(a, goodness_of_fit)

  setTxtProgressBar(pb, i)
}
close(pb)

write.csv(goodness_of_fit, "results/09_goodness-of-fit-2.5km.csv", row.names = F)
```