diff --git a/analysis_paf.R b/analysis_paf.R new file mode 100644 index 0000000..2b44333 --- /dev/null +++ b/analysis_paf.R @@ -0,0 +1,3 @@ +### Proportion of affected species + +# prop_critical ~ sites + (1|method) + (1|richness) \ No newline at end of file diff --git a/codes/analysis_occurrence_data.R b/codes/analysis_occurrence_data.R index 06ae69f..d9f1f8d 100644 --- a/codes/analysis_occurrence_data.R +++ b/codes/analysis_occurrence_data.R @@ -1,10 +1,10 @@ #################### eDNA expeditions - scientific analysis #################### ########################## Environmental data download ######################### # January of 2024 -# Author: Mike Burrows -# Contact: michael.burrows@sams.ac.uk +# Author: Silas Principe, Mike Burrows +# Contact: s.principe@unesco.org; michael.burrows@sams.ac.uk # -########## Read and analyse eDNA data from UNESCO World Heritage Sites ######### +############## Plot eDNA data from UNESCO World Heritage Sites ################# # Load packages library(dplyr) @@ -12,11 +12,10 @@ library(stringr) library(purrr) library(arrow) library(data.table) +library(ggplot2) fs::dir_create("figures") -#### - speciestherm<-read_parquet("results/species_tsummaries.parquet") speciesthermdt<-data.table(speciestherm) @@ -30,28 +29,18 @@ speciesthermsite<-read_parquet("results/tsummaries_aggregated.parquet") speciesthermsitedt<-data.table(speciesthermsite) -### CTI by higher geography areas ##### - -names(speciesthermsitedt) -# [1] "species" "AphiaID" "source_obis" "source_gbif" "source_dna" -# [6] "higherGeography" "q_0.5" "q_0.75" "q_0.9" "q_0.95" -# [11] "q_0.99" "q_1" "mean" "sd" "q_0" -# [16] "q_0.01" "q_0.05" "q_0.1" "q_0.25" "site_current" -# [21] "site_ssp126_dec100" "site_ssp126_dec50" "site_ssp245_dec100" "site_ssp245_dec50" "site_ssp370_dec100" -# [26] "site_ssp370_dec50" "site_ssp460_dec100" "site_ssp460_dec50" "site_ssp585_dec100" "site_ssp585_dec50" -# [31] "records" "where" +speciesthermsitedt$where <- ifelse(speciesthermsitedt$where == "Both" | + speciesthermsitedt$where == "OBIS/GBIF", "Databases", "eDNA") +### CTI by higher geography areas ##### ctihighergeog<-speciesthermsitedt[,list(ctiavg=mean(q_0.5), sdcti=sd(q_0.5), str=mean(q_0.9-q_0.1), sstavg=mean(site_current), nspp=.N),by=c("higherGeography","where")] -# Save for later use -write_parquet(ctihighergeog, "results/cti_highergeo.parquet") # Save figures -pdf("figures/AllSpeciesThermalMetrics.pdf",height=3.5,width=8) par(mfrow=c(1,3)) plot(data=ctihighergeog,ctiavg~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,main="All species", xlab="Sea surface temperature (°C)", ylab="Community Temperature Index (°C)") @@ -68,6 +57,53 @@ plot(data=ctihighergeog,str~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5, #abline(a=0,b=1,lty=3) legend("bottomleft",legend=levels(as.factor(ctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +(p1 <- ggplot(ctihighergeog) + + geom_abline(slope = 1, intercept = 0, color = "grey90") + + geom_point(aes(x = sstavg, y = ctiavg, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + scale_y_continuous(limits = c(5, 30)) + + scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("Community Thermal Index (°C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +(p2 <- ggplot(ctihighergeog) + + geom_point(aes(x = sstavg, y = sdcti, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + # scale_y_continuous(limits = c(5, 30)) + + # scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("SD CTI (°C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +(p3 <- ggplot(ctihighergeog) + + geom_point(aes(x = sstavg, y = str, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + # scale_y_continuous(limits = c(5, 30)) + + # scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("Species thermal range (average, °C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +library(patchwork) +p1 + p2 + p3 + patchwork::plot_layout(guides = "collect") + ### Add taxon information specieslist<-read_parquet("results/species_list.parquet") diff --git a/codes/get_sst_marinespecies.R b/codes/get_sst_marinespecies.R index 39ba517..8edf7b1 100644 --- a/codes/get_sst_marinespecies.R +++ b/codes/get_sst_marinespecies.R @@ -8,10 +8,13 @@ # Load packages ---- library(DBI) +library(furrr) +library(storr) library(dplyr) library(tidyr) library(arrow) library(terra) +library(polars) fs::dir_create("results") # Load files/settings ---- @@ -65,15 +68,66 @@ names(env) <- env_vars # Get H3 ids from the environmental layer biooracle_h3 <- open_dataset("data/biooracle_h3id.parquet") -bio_h3_sel <- biooracle_h3 %>% - filter(h3 %in% unique(db_species$h3)) %>% +# Get unique h3 ids +un_h3s <- unique(db_species$h3) + +# Get cells for those available +bio_h3_presel <- biooracle_h3 %>% + filter(h3 %in% un_h3s) %>% select(cell, h3) %>% collect() +# Get not available +not_available <- un_h3s[!un_h3s %in% unique(bio_h3_presel$h3)] + +# Get disks +# Create a temporary folder to hold disks +fs::dir_create("t_disks") + +plan(multisession, workers = 8) +not_available_h3_b <- split(not_available, as.integer((seq_along(not_available) - 1) / 2000)) + +un_h3s_disk <- future_map(not_available_h3_b, function(x){ + + bio_h3 <- open_dataset("data/biooracle_h3id.parquet") + + disk_cells <- lapply(x, function(z){ + dc <- unlist(h3jsr::get_disk(z)) + data.frame(h3_original = z, h3 = dc) + }) + disk_cells <- bind_rows(disk_cells) + + pc <- bio_h3 %>% + filter(h3 %in% unique(disk_cells$h3)) %>% + select(cell, h3) %>% + collect() + + disk_cells <- left_join(disk_cells, pc) %>% + filter(!is.na(cell)) + + write_parquet(disk_cells, + paste0("t_disks/", x[1], ".parquet")) + NULL +}, .progress = T) + +# Aggregate expanded disks data +disk_f <- open_dataset("t_disks/") +disk_f <- disk_f %>% + collect() + +# Merge with previous one +bio_h3_presel <- bio_h3_presel %>% + mutate(h3_original = h3) %>% + select(h3_original, h3, cell) + +bio_h3_sel <- bind_rows(disk_f, bio_h3_presel) + + # Extract env data env_ext <- terra::extract(env, y = as.vector(bio_h3_sel$cell)) %>% bind_cols(bio_h3_sel) %>% - select(-cell) %>% + select(-cell, -h3) %>% + rename(h3 = h3_original) %>% group_by(h3) %>% summarise(across(everything(), ~ mean(.x, na.rm = TRUE))) diff --git a/codes/models_preview.pdf b/codes/models_preview.pdf new file mode 100644 index 0000000..481ada7 Binary files /dev/null and b/codes/models_preview.pdf differ diff --git a/codes/models_preview.qmd b/codes/models_preview.qmd new file mode 100644 index 0000000..302eaec --- /dev/null +++ b/codes/models_preview.qmd @@ -0,0 +1,376 @@ +--- +title: "eDNA climate analysis - models" +format: + pdf: + toc: true +--- + +This document contain the preliminary models for the eDNA climate analysis. + +At the end of the document there is a section called **"Thermal limits: extracted X experimental"** where I compare our thermal limits with those obtained through experiments for a subset of species. + +## Percentage of affected species + +Question: considering a scenario of stability in species composition, how many species of each site will be affected by climate change, i.e. experience temperatures above their upper limits? + +Hypothesis: the proportion of affected species will increase in all scenarios, being higher on the worst case scenarios. That trend should be apparent in all sites, but certain sites will be more affected than others. Both databases and eDNA will capture that trend. + +```{r message=FALSE, warning=FALSE} +library(arrow) +library(data.table) +library(lme4) +``` + + +```{r} + +# Read species summaries +speciesthermsite <- read_parquet("../results/tsummaries_aggregated.parquet") + +# Convert to DT +speciesthermsitedt <- data.table(speciesthermsite) +speciesthermsitedt$where <- ifelse(speciesthermsitedt$where == "Both" | + speciesthermsitedt$where == "OBIS/GBIF", "Databases", "eDNA") + +# Produce summaries +sthighergeog <- speciesthermsitedt[,list(ctiavg=mean(q_0.5), + sdcti=sd(q_0.5), + str=mean(q_0.9-q_0.1), + sstavg=mean(site_current), + nspp=.N, + pgt90now=sum(site_current>q_0.9)/.N, + pgt90ssp1d50=sum(site_ssp126_dec50>q_0.9)/.N, + pgt90ssp1d100=sum(site_ssp126_dec100>q_0.9)/.N, + pgt90ssp2d50=sum(site_ssp245_dec50>q_0.9)/.N, + pgt90ssp2d100=sum(site_ssp245_dec100>q_0.9)/.N, + pgt90ssp3d50=sum(site_ssp370_dec50>q_0.9)/.N, + pgt90ssp3d100=sum(site_ssp370_dec100>q_0.9)/.N, + pgt90ssp4d50=sum(site_ssp460_dec50>q_0.9)/.N, + pgt90ssp4d100=sum(site_ssp460_dec100>q_0.9)/.N, + pgt90ssp5d50=sum(site_ssp585_dec50>q_0.9)/.N, + pgt90ssp5d100=sum(site_ssp585_dec100>q_0.9)/.N), + by=c("higherGeography","where")] + +head(sthighergeog) +``` + +Once we prepare the data we can make the models. We start by doing a simple linear regression. + +```{r} +sthighergeog$where <- as.factor(sthighergeog$where) +sthighergeog$higherGeography <- as.factor(sthighergeog$higherGeography) + +sthighergeog_l <- data.table(tidyr::pivot_longer(sthighergeog, + cols = 8:18, + names_to = "scenario", values_to = "paf")) + +sthighergeog_l50 <- sthighergeog_l[grepl("d50|now", scenario),,] +sthighergeog_l100 <- sthighergeog_l[grepl("d100|now", scenario),,] + +sthighergeog_l50$scenario <- as.factor(sthighergeog_l50$scenario) +sthighergeog_l100$scenario <- as.factor(sthighergeog_l100$scenario) + +boxplot(paf ~ scenario, data = sthighergeog_l50) + +paf_lm1 <- glm(paf ~ scenario, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm1) +``` + +Check the residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm1) +``` + +QQ plot is not good, but the rest is ok. + +Model considering the source of the data: + +```{r} +paf_lm2 <- glm(paf ~ scenario + where, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm2) +``` + +Check the residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm2) +``` + +Try to include the interaction between "where" and "scenario": + +```{r} +paf_lm3 <- glm(paf ~ scenario*where, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm3) +``` + +Check residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm3) +``` + +Try a Linear Mixed-Effects model. In this case, we will focus in modelling PAF ~ scenario, with "where" as a random effect. + +```{r} +paf_lme1 <- glmer(paf ~ scenario + (1|where), data = sthighergeog_l50, family = binomial(), + weights = nspp) + +summary(paf_lme1) +``` + +The source does explain some of the variation, but not a great amount compared to the scenarios. + +```{r} +par(mfrow = c(1,2)) +plot(paf_lme1) +qqnorm(resid(paf_lme1)) +qqline(resid(paf_lme1)) +``` + +Try now a model with higherGeography as a random effect: + +```{r} +paf_lme2 <- glmer(paf ~ scenario + (1|higherGeography), data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lme2) +``` + +In this case, the higherGeography have more influence on the values of PAF. This seems to be an interesting model to proceed with. + +Check residuals: + +```{r} +par(mfrow = c(1,2)) +plot(paf_lme2) +qqnorm(resid(paf_lme2)) +qqline(resid(paf_lme2)) +``` + +Both models show the trend of increase in affected species as we go to the worst case scenarios. + +Try to include the temperature trend on the model: + +```{r} +paf_lm_sst1 <- glm(paf ~ scenario + where + sstavg, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm_sst1) + +sthighergeog_l50_sc <- sthighergeog_l50 +sthighergeog_l50_sc$sstavg <- scale(sthighergeog_l50_sc$sstavg) + +paf_lme_sst1 <- glmer(paf ~ scenario + sstavg + (1|where), data = sthighergeog_l50_sc, family = binomial(), weights = nspp) + +summary(paf_lme_sst1) + +paf_gam_sst1 <- mgcv::gam(paf ~ scenario + where + s(sstavg, k = 4), data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_gam_sst1) +``` + + +### Removing species close to the current temperature limits (tropical) + +```{r} +sst <- terra::rast("~/Research/mpa_europe/mpaeu_sdm/data/env/current/thetao_baseline_depthsurf_mean.tif") +sst <- as.data.frame(sst) + +plot(density(sst$thetao_mean)) + +sst_q <- quantile(sst$thetao_mean, 0.9) # We can try other values + +speciesthermsitedt <- speciesthermsitedt[q_1 <= sst_q,,] + +sthighergeog <- speciesthermsitedt[,list(ctiavg=mean(q_0.5), + sdcti=sd(q_0.5), + str=mean(q_0.9-q_0.1), + sstavg=mean(site_current), + nspp=.N, + pgt90now=sum(site_current>q_0.9)/.N, + pgt90ssp1d50=sum(site_ssp126_dec50>q_0.9)/.N, + pgt90ssp1d100=sum(site_ssp126_dec100>q_0.9)/.N, + pgt90ssp2d50=sum(site_ssp245_dec50>q_0.9)/.N, + pgt90ssp2d100=sum(site_ssp245_dec100>q_0.9)/.N, + pgt90ssp3d50=sum(site_ssp370_dec50>q_0.9)/.N, + pgt90ssp3d100=sum(site_ssp370_dec100>q_0.9)/.N, + pgt90ssp4d50=sum(site_ssp460_dec50>q_0.9)/.N, + pgt90ssp4d100=sum(site_ssp460_dec100>q_0.9)/.N, + pgt90ssp5d50=sum(site_ssp585_dec50>q_0.9)/.N, + pgt90ssp5d100=sum(site_ssp585_dec100>q_0.9)/.N), + by=c("higherGeography","where")] + +sthighergeog$where <- as.factor(sthighergeog$where) +sthighergeog$higherGeography <- as.factor(sthighergeog$higherGeography) + +sthighergeog_l <- data.table(tidyr::pivot_longer(sthighergeog, + cols = 8:18, + names_to = "scenario", values_to = "paf")) + +sthighergeog_l50 <- sthighergeog_l[grepl("d50|now", scenario),,] +sthighergeog_l100 <- sthighergeog_l[grepl("d100|now", scenario),,] + +sthighergeog_l50$scenario <- as.factor(sthighergeog_l50$scenario) +sthighergeog_l100$scenario <- as.factor(sthighergeog_l100$scenario) + +boxplot(paf ~ scenario, data = sthighergeog_l50) +``` + +Try again the same models: + +```{r} +paf_lm1_rs <- glm(paf ~ scenario, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm1_rs) +``` + +Check the residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm1_rs) +``` + +QQ plot is not good, but the rest is ok. + +Model considering the source of the data: + +```{r} +paf_lm2_rs <- glm(paf ~ scenario + where, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm2_rs) +``` + +Check the residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm2_rs) +``` + +Try to include the interaction between "where" and "scenario": + +```{r} +paf_lm3_rs <- glm(paf ~ scenario*where, data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lm3_rs) +``` + +Check residuals: + +```{r fig.height=6} +par(mfrow = c(2,2)) +plot(paf_lm3_rs) +``` + +Try a Linear Mixed-Effects model. In this case, we will focus in modelling PAF ~ scenario, with "where" as a random effect. + +```{r} +paf_lme1_rs <- glmer(paf ~ scenario + (1|where), data = sthighergeog_l50, family = binomial(), + weights = nspp) + +summary(paf_lme1_rs) +``` + +The source does explain some of the variation, but not a great amount compared to the scenarios. + +```{r} +par(mfrow = c(1,2)) +plot(paf_lme1_rs) +qqnorm(resid(paf_lme1_rs)) +qqline(resid(paf_lme1_rs)) +``` + +Try now a model with higherGeography as a random effect: + +```{r} +paf_lme2_rs <- glmer(paf ~ scenario + (1|higherGeography), data = sthighergeog_l50, family = binomial(), weights = nspp) + +summary(paf_lme2_rs) +``` + +The higherGeography have even more influence on the values of PAF. + +Check residuals: + +```{r} +par(mfrow = c(1,2)) +plot(paf_lme2_rs) +qqnorm(resid(paf_lme2_rs)) +qqline(resid(paf_lme2_rs)) +``` + + +## Thermal limits: extracted X experimental + +I obtained the thermal limits from the [GlobTherm dataset](https://datadryad.org/stash/dataset/doi:10.5061/dryad.1cv08). + +```{r} +suppressPackageStartupMessages(library(dplyr)) + +# Get only unique species - because we will use only the thermal limits, we can get from the speciesthermsitedt table +species_filt <- speciesthermsitedt[!duplicated(speciesthermsitedt$species),] + +# Load the GlobTherm dataset +globtherm <- suppressMessages(suppressWarnings(readxl::read_xlsx("~/Downloads/GlobalTherm_upload_10_11_17.xlsx"))) +globtherm_sp <- paste(globtherm$Genus, globtherm$Species) + +# See which species are available +gt_sel <- which(globtherm_sp %in% species_filt$species) +gt_sel <- globtherm[gt_sel,] +gt_sel <- gt_sel %>% + mutate(species = globtherm_sp[globtherm_sp %in% species_filt$species]) %>% + select(species, Tmax, max_metric) + +# See which metrics are available +table(gt_sel$max_metric) + +# Select only those that are on our list +species_filt_sel <- species_filt[species_filt$species %in% gt_sel$species,] +species_filt_sel <- species_filt_sel %>% + select(species, q_0.9) + +# Join both tables +tlimits <- species_filt_sel %>% + left_join(gt_sel) %>% + filter(!is.na(Tmax)) + +nrow(tlimits) + +# Plot +plot(tlimits$q_0.9 ~ tlimits$Tmax, xlim = c(0, 40), ylim = c(0,40)) +abline(a = 0, b = 1) + +m1 <- lm(q_0.9~Tmax, data = tlimits) +abline(m1, col = "red") +``` + +We see that for a great portion of the species, the species thermal maxima is actually much higher than the one we estimated through the occurrence data. + +It is also possible to use only "CTmax": + +```{r} + +gt_sel_ctmax <- gt_sel %>% + filter(max_metric == "ctmax") + +# Join both tables +tlimits_ctmax <- species_filt_sel %>% + left_join(gt_sel_ctmax) %>% + filter(!is.na(Tmax)) + +nrow(tlimits_ctmax) + +# Plot +plot(tlimits_ctmax$q_0.9 ~ tlimits_ctmax$Tmax, xlim = c(0, 40), ylim = c(0,40)) +abline(a = 0, b = 1) + +m2 <- lm(q_0.9~Tmax, data = tlimits_ctmax) +abline(m1, col = "red") +``` + diff --git a/codes/plot_analysis_occurrence_data.R b/codes/plot_analysis_occurrence_data.R new file mode 100644 index 0000000..1414a41 --- /dev/null +++ b/codes/plot_analysis_occurrence_data.R @@ -0,0 +1,256 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas Principe, Mike Burrows +# Contact: s.principe@unesco.org; michael.burrows@sams.ac.uk +# +############## Plot eDNA data from UNESCO World Heritage Sites ################# + +# Load packages +library(dplyr) +library(stringr) +library(purrr) +library(arrow) +library(data.table) +library(ggplot2) +fs::dir_create("figures") + + +speciestherm<-read_parquet("results/species_tsummaries.parquet") + +speciesthermdt<-data.table(speciestherm) + +speciesthermsst<-speciesthermdt[depth=="depthsurf" & variant=="mean",,] +speciesthermsstc<-dcast(speciesthermdt,species~metric,value.var = "value",fun=mean) + +## Site by species occurrence table + +speciesthermsite<-read_parquet("results/tsummaries_aggregated.parquet") + +speciesthermsitedt<-data.table(speciesthermsite) + +speciesthermsitedt$where <- ifelse(speciesthermsitedt$where == "Both" | + speciesthermsitedt$where == "OBIS/GBIF", "Databases", "eDNA") + +### CTI by higher geography areas ##### +ctihighergeog<-speciesthermsitedt[,list(ctiavg=mean(q_0.5), + sdcti=sd(q_0.5), + str=mean(q_0.9-q_0.1), + sstavg=mean(site_current), + nspp=.N),by=c("higherGeography","where")] + + +# Save figures +par(mfrow=c(1,3)) +plot(data=ctihighergeog,ctiavg~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,main="All species", + xlab="Sea surface temperature (°C)", ylab="Community Temperature Index (°C)") +abline(a=0,b=1,lty=3) +legend("topleft",legend=levels(as.factor(ctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +plot(data=ctihighergeog,sdcti~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,main="All species", + xlab="Sea surface temperature (°C)", ylab="SD CTI (°C)") +#abline(a=0,b=1,lty=3) +legend("bottomleft",legend=levels(as.factor(ctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +plot(data=ctihighergeog,str~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,main="All species", + xlab="Sea surface temperature (°C)", ylab="Species thermal range (average, °C)") +#abline(a=0,b=1,lty=3) +legend("bottomleft",legend=levels(as.factor(ctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + + +(p1 <- ggplot(ctihighergeog) + + geom_abline(slope = 1, intercept = 0, color = "grey90") + + geom_point(aes(x = sstavg, y = ctiavg, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + scale_y_continuous(limits = c(5, 30)) + + scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("Community Thermal Index (°C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +(p2 <- ggplot(ctihighergeog) + + geom_point(aes(x = sstavg, y = sdcti, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + # scale_y_continuous(limits = c(5, 30)) + + # scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("SD CTI (°C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +(p3 <- ggplot(ctihighergeog) + + geom_point(aes(x = sstavg, y = str, colour = where, size = nspp), alpha = .5) + + scale_color_manual(values = c("#00c3c9", "#f58000")) + #"#5151db", + # scale_y_continuous(limits = c(5, 30)) + + # scale_x_continuous(limits = c(5, 30)) + + scale_size(range = c(2,7)) + + guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3))) + + ylab("Species thermal range (average, °C)") + + xlab("Sea Surface Temperature (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + +library(patchwork) +p1 + p2 + p3 + patchwork::plot_layout(guides = "collect") + plot_annotation(tag_levels = 'A') + +ggsave("figures/thermal_comp_allsp.png", width = 14, height = 5) + + +### ALTERNATIVE PLOT +for (i in 1:100) { + + resampled <- speciesthermsitedt[, list( + q_0.5 = sample(q_0.5, .N, replace = T), + q_0.9 = sample(q_0.9, .N, replace = T), + q_0.1 = sample(q_0.1, .N, replace = T), + site_current = sample(site_current, .N, replace = T) + #resample = 1:.N#sample(1:.N, .N, replace = T) + ), by = c("higherGeography", "where")] + + ctihighergeog_temp <- resampled[, list( + ctiavg = mean(q_0.5), + sdcti = sd(q_0.5), + str = mean(q_0.9 - q_0.1), + sstavg = mean(site_current), + nspp = .N + ), by = c("higherGeography", "where")] + + if (i == 1) { + ctihighergeog_boot <- ctihighergeog_temp + } else { + ctihighergeog_boot <- rbind(ctihighergeog_boot, ctihighergeog_temp) + } + +} + + + +### Bootstrap version +ctihighergeog$higherGeography <- substr(ctihighergeog$higherGeography, 1, 10) +ctihighergeog$higherGeography <- factor(ctihighergeog$higherGeography, + levels = unique(ctihighergeog$higherGeography[order(ctihighergeog$sstavg)])) + +ctihighergeog_boot$higherGeography <- substr(ctihighergeog_boot$higherGeography, 1, 10) +ctihighergeog_boot$higherGeography <- factor(ctihighergeog_boot$higherGeography, + levels = levels(ctihighergeog$higherGeography)) + +ctihighergeog_boot_sum <- ctihighergeog_boot[, list( + ctiavg_low = quantile(ctiavg, 0.25), + ctiavg_median = median(ctiavg), + ctiavg_high = quantile(ctiavg, 0.75), + sdcti_low = quantile(sdcti, 0.25), + sdcti_median = median(sdcti), + sdcti_high = quantile(sdcti, 0.75), + str_low = quantile(str, 0.25), + str_median = median(str), + str_high = quantile(str, 0.75), + nspp = mean(nspp) +), by = c("higherGeography", "where")] + +ctihighergeog_sst <- ctihighergeog_boot[,list( + sstavg = mean(sstavg) +), by = c("higherGeography", "where")] +ctihighergeog_sst$what = "Average site SST" + +(p1_boot <- ggplot() + + geom_point(data = ctihighergeog_sst, aes(x = higherGeography, y = sstavg, fill = what), + shape = 15, size = 4, alpha = .1) + + geom_jitter(data = ctihighergeog_boot, aes(x = higherGeography, y = ctiavg, color = where), + shape = 16, size = 1, alpha = .05, + position = position_jitterdodge(dodge.width = 0.5, + jitter.height = 0, jitter.width = 0.1)) + + geom_linerange(data = ctihighergeog_boot_sum, + aes(x = higherGeography, ymin = ctiavg_low, ymax = ctiavg_high, + color = where), linewidth = 1, + position = position_dodge(width = 0.5)) + + geom_point(data = ctihighergeog, aes(x = higherGeography, y = ctiavg, color = where, size = nspp), + shape = 16, position = position_dodge(width = 0.5), alpha = .6) + + scale_color_manual(values = c("#00c3c9", "#f58000"), guide = "none") + #"#5151db", + scale_fill_manual(values = c("grey70")) + + scale_size(range = c(2,7), guide = "none") + + #guides(fill = guide_legend(override.aes = list(alpha = 1))) + + ylab("Temperature (°C)") + + xlab(NULL) + + theme_light() + + coord_flip() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank())) + + + +p1_boot + p2 + p3 + patchwork::plot_layout(guides = "collect") + plot_annotation(tag_levels = 'A') + +ggsave("figures/test_newcti_boot.png", width = 14, height = 5) + + + + + + + + + +### Add taxon information + +specieslist<-read_parquet("results/species_list.parquet") + +speciesthermsitedt1<-merge(speciesthermsitedt,specieslist,by="species") + +fishthermsitedt<-speciesthermsitedt1[group=="fish",,] + +### CTI by fish only ##### + + +fishctihighergeog<-fishthermsitedt[,list(ctiavg=mean(q_0.5), + sdcti=sd(q_0.5), + str=mean(q_0.9-q_0.1), + sstavg=mean(site_current), + pgt90now=sum(site_current>q_0.9)/.N, + pgt90ssp2=sum(site_ssp245_dec50>q_0.9)/.N, + nspp=.N),by=c("higherGeography","where")] + +par(mfrow=c(1,3)) +plot(data=fishctihighergeog,ctiavg~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5, + xlab="Sea surface temperature (°C)", ylab="Community Temperature Index (°C)",main="Fish") +abline(a=0,b=1,lty=3) +legend("topleft",legend=levels(as.factor(fishctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +plot(data=fishctihighergeog,sdcti~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5, + xlab="Sea surface temperature (°C)", ylab="SD CTI (°C)",main="Fish") +#abline(a=0,b=1,lty=3) +legend("bottomleft",legend=levels(as.factor(fishctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +plot(data=fishctihighergeog,str~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5, + xlab="Sea surface temperature (°C)", ylab="Species thermal range (average, °C)",main="Fish") +#abline(a=0,b=1,lty=3) +legend("bottomleft",legend=levels(as.factor(fishctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) +dev.off() + +### Proportion of species living above their T90 temperatures + +par(mfrow=c(1,2)) +plot(data=fishctihighergeog,pgt90now~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,ylim=c(0,1), + xlab="Sea surface temperature (°C)", ylab="Proportion of species with SST>T90",main="Fish") +abline(a=0,b=1,lty=3) +legend("topleft",legend=levels(as.factor(fishctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + +plot(data=fishctihighergeog,pgt90ssp2~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5,ylim=c(0,1), + xlab="Sea surface temperature (°C)", ylab="Proportion of species with SST>T90 under SSP2",main="Fish") +abline(a=0,b=1,lty=3) +legend("topleft",legend=levels(as.factor(fishctihighergeog$where)),inset=0.02,col=1:3,pt.cex=1.5,pch=16) + + + + diff --git a/codes/plot_paf.R b/codes/plot_paf.R new file mode 100644 index 0000000..aef7e3e --- /dev/null +++ b/codes/plot_paf.R @@ -0,0 +1,225 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas Principe, Mike Burrows +# Contact: s.principe@unesco.org; michael.burrows@sams.ac.uk +# +############## Plot eDNA data from UNESCO World Heritage Sites ################# + +# Load packages ---- +library(dplyr) +library(terra) +library(tidyr) +library(stringr) +library(purrr) +library(arrow) +library(data.table) +library(patchwork) +library(ggplot2) +library(fishualize) +library(ggdist) +library(see) +fs::dir_create("figures") + +# Load data ---- +# Load functional traits data +funcdiv <- read.csv("data/funcdiversity.csv", sep = "\t") +table(funcdiv$Habitat) +summary(funcdiv$DepthRangeDeep) + +# Select shallow water species +summary(funcdiv$DepthRangeDeep) # There are 1348 species with NA values + +# Limit to those with maximum depth 200m +sel_species <- funcdiv %>% + filter(!is.na(DepthRangeDeep)) %>% + filter(DepthRangeDeep <= 200) + + + +# Load temperature summaries by site +speciesthermsite <- read_parquet("results/tsummaries_aggregated.parquet") + +# Process the data ---- +# Convert to data.table +speciesthermsitedt <- data.table(speciesthermsite) + +# Change `where` column IDs +speciesthermsitedt$where <- ifelse( + speciesthermsitedt$where == "Both" | speciesthermsitedt$where == "OBIS/GBIF", + "Databases", + "eDNA" +) + +# Load species list +specieslist <- read_parquet("results/species_list.parquet") + +# Merge group information +speciesthermsitedt_gp <- merge(speciesthermsitedt, specieslist[,c("species", "group")], by = "species") + +# Filter for fish only +fishthermsitedt <- speciesthermsitedt_gp[group == "fish", , ] + +# Convert to longer format +fishthermsitedt_long <- fishthermsitedt %>% + filter(species %in% unique(sel_species$X)) %>% # Remove this line to not filter by the traits + pivot_longer(cols = starts_with("site"), names_to = "scenario", + values_to = "sst") %>% + filter(scenario %in% c("site_current", "site_ssp126_dec50", "site_ssp245_dec50", + "site_ssp370_dec50", "site_ssp585_dec50")) + +# Create a function to see at which scenario the species become at risk +scenario_order <- c("site_current", "site_ssp126_dec50", + "site_ssp585_dec50") +get_class <- function(x) { + if(all(x < 0)) { + return("not_risk") + } else { + return(scenario_order[min(which(x > 0))]) + } +} + +# Retrieve sites names and latitudes +sites <- vect("data/shapefiles/marine_world_heritage.gpkg") +sites$latitude <- unlist(sapply(1:length(sites), function(x){ + unname(terra::geom(centroids(sites[x,]))[,4]) +})) +sites_org <- data.frame(sites = sites$name, latitude = sites$latitude) + +# Change names of sites +sites_org <- sites_org %>% + rename(higherGeography = sites) %>% + mutate(higherGeography = case_when( + higherGeography == "Wadden Sea" ~ "Wadden Sea", + higherGeography == "Archipiélago de Revillagigedo" ~ "Revillagigedo", + higherGeography == "iSimangaliso Wetland Park" ~ "iSimangaliso", + higherGeography == "Ningaloo Coast" ~ "Ningaloo coast", + higherGeography == "Socotra Archipelago" ~ "Socotra Archipelago", + higherGeography == "Belize Barrier Reef Reserve System" ~ "Belize Barrier Reef", + higherGeography == "Cocos Island National Park" ~ "Cocos Island", + higherGeography == "Lord Howe Island Group" ~ "Lord Howe Island", + higherGeography == "Sundarbans National Park" ~ "Sundarbans", + higherGeography == "Península Valdés" ~ "Peninsula Valdes", + higherGeography == "Gulf of Porto: Calanche of Piana, Gulf of Girolata, Scandola Reserve" ~ "Scandola", + higherGeography == "Coiba National Park and its Special Zone of Marine Protection" ~ "Coiba", + higherGeography == "Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems" ~ "Lagoons of New Caledonia", + higherGeography == "Shark Bay, Western Australia" ~ "Shark Bay", + higherGeography == "Tubbataha Reefs Natural Park" ~ "Tubbataha Reefs", + higherGeography == "Brazilian Atlantic Islands: Fernando de Noronha and Atol das Rocas Reserves" ~ "Fernando de Noronha", + higherGeography == "Everglades National Park" ~ "Everglades", + higherGeography == "Aldabra Atoll" ~ "Aldabra Atoll", + higherGeography == "Banc d'Arguin National Park" ~ "Banc d'Arguin", + higherGeography == "French Austral Lands and Seas" ~ "French Austral Lands", + )) %>% + filter(!is.na(higherGeography)) %>% + arrange(desc(latitude)) %>% + distinct(higherGeography, .keep_all = T) + +# Update names on the longer object +fishthermsitedt_long <- fishthermsitedt_long %>% + mutate(higherGeography = case_when( + higherGeography == "wadden sea" ~ "Wadden Sea", + higherGeography == "archipielago de revillagigedo" ~ "Revillagigedo", + higherGeography == "isimangaliso wetland park" ~ "iSimangaliso", + higherGeography == "ningaloo coast" ~ "Ningaloo coast", + higherGeography == "socotra archipelago" ~ "Socotra Archipelago", + higherGeography == "belize barrier reef reserve system" ~ "Belize Barrier Reef", + higherGeography == "cocos island national park" ~ "Cocos Island", + higherGeography == "lord howe island group" ~ "Lord Howe Island", + higherGeography == "the sundarbans" ~ "Sundarbans", + higherGeography == "peninsula valdes" ~ "Peninsula Valdes", + higherGeography == "gulf of porto calanche of piana gulf of girolata scandola reserve" ~ "Scandola", + higherGeography == "coiba national park and its special zone of marine protection" ~ "Coiba", + higherGeography == "lagoons of new caledonia reef diversity and associated ecosystems" ~ "Lagoons of New Caledonia", + higherGeography == "shark bay western australia" ~ "Shark Bay", + higherGeography == "tubbataha reefs natural park" ~ "Tubbataha Reefs", + higherGeography == "brazilian atlantic islands fernando de noronha and atol das rocas reserves" ~ "Fernando de Noronha", + higherGeography == "everglades national park" ~ "Everglades", + higherGeography == "aldabra atoll" ~ "Aldabra Atoll", + higherGeography == "banc d arguin national park" ~ "Banc d'Arguin", + higherGeography == "french austral lands and seas" ~ "French Austral Lands", + )) + +# Get the number of species in each site +sites_species <- fishthermsitedt_long %>% + group_by(higherGeography, species) %>% + summarise(n = n()) %>% + group_by(higherGeography) %>% + summarise(n = n()) + +# Join with the sites object +sites_org <- left_join(sites_org, sites_species) + +# Create the plot object with all information +plot_obj <- fishthermsitedt_long %>% + filter(scenario %in% c("site_current", "site_ssp126_dec50", "site_ssp585_dec50")) %>% + select(species, higherGeography, q_0.95, scenario, sst) %>% + mutate(differences = sst - q_0.95) %>% + group_by(higherGeography, species) %>% + mutate(q_0.95 = mean(q_0.95), point_class = get_class(differences)) %>% + mutate(higherGeography = factor(higherGeography, levels = rev(sites_org$higherGeography), + labels = rev(paste(sites_org$higherGeography, paste0("(", sites_org$n, ")"))))) +plot_obj$point_class <- factor(plot_obj$point_class, levels = c("not_risk", "site_current", "site_ssp126_dec50", "site_ssp585_dec50")) + +# Extract sites SST +sites_sst <- fishthermsitedt_long %>% + filter(scenario %in% c("site_current", "site_ssp126_dec50", "site_ssp585_dec50")) %>% + group_by(higherGeography, scenario) %>% + summarise(sst = mean(sst)) %>% + mutate(point_class = scenario) %>% + left_join(sites_org[,c("higherGeography", "n")]) %>% + mutate(higherGeography = paste(higherGeography, paste0("(", n, ")"))) +sites_sst$point_class <- factor(sites_sst$point_class, levels = c("not_risk", "site_current", "site_ssp126_dec50", "site_ssp585_dec50")) + + +# Make plots ---- +# Left plot - dots +plot_a <- plot_obj %>% + ggplot() + + geom_violinhalf(aes(x = higherGeography, y = q_0.95), + position = position_nudge(x = 0.3)) + + geom_jitter(aes(x = higherGeography, y = q_0.95, color = point_class), alpha = .3, + width = 0.12, shape = 16, show.legend = F) + + geom_linerange(data = sites_sst, aes(ymin = sst-0.1, ymax = sst+0.1, x = higherGeography, color = point_class), + linewidth = 7, show.legend = T) + + scale_color_manual(values = c("grey90", "#d4b9da", "#df65b0", "#980043"), + labels = c("Not at risk", "Current", "SSP1", "SSP5")) + + coord_flip() + + ylab("Temperature (°C)") + xlab(NULL) + + theme_light() + + theme(panel.border = element_blank(), + panel.grid.minor.x = element_blank(), + panel.grid.major.y = element_blank(), + legend.title = element_blank(), + legend.position = "bottom", + strip.background = element_blank(), + strip.text = element_text(color = "black"), + plot.margin = unit(c(0.1,0.1,0.1,0), "cm"));plot_a + +# Right plot - bars +plot_b <- plot_obj %>% + group_by(higherGeography, scenario) %>% + summarise(percentage = sum(differences > 0)/length(differences)) %>% + ggplot() + + geom_bar(aes(x = higherGeography, y = percentage, fill = scenario), stat = "identity", position="dodge", width = .6) + + scale_fill_manual(values = c("#d4b9da", "#df65b0", "#980043")) + + scale_y_continuous(labels = c("0%", "", "50%", "", "100%")) + + coord_flip() + + ylab("Potentially Affected Fraction") + xlab(NULL) + + theme_light() + + theme(panel.border = element_blank(), + panel.grid.minor.x = element_blank(), + panel.grid.major.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + legend.title = element_blank(), + legend.position = "none", + strip.background = element_blank(), + strip.text = element_text(color = "black"), + plot.margin = unit(c(0,0,0,0), "cm"));plot_b + +# Put together +plot_a + plot_b + plot_layout(widths = c(4, 1)) + +# Save +ggsave("figures/paf_fishes_filtered_v1.png", width = 12, height = 7) \ No newline at end of file diff --git a/codes/plot_temperatures_glossy.R b/codes/plot_temperatures_glossy.R new file mode 100644 index 0000000..37cc02a --- /dev/null +++ b/codes/plot_temperatures_glossy.R @@ -0,0 +1,136 @@ +# Plot SST maps with differences for glossy report + +# Load packages +library(terra) +library(ggplot2) +library(patchwork) + +# Set directory +sh <- "../../../mpa_europe/mpaeu_sdm/data/env/" + +# Load environmental layers +current <- rast(paste0(sh, "current/thetao_baseline_depthsurf_mean.tif")) +ssp1 <- rast(paste0(sh, "future/ssp126/thetao_ssp126_depthsurf_dec50_mean.tif")) +ssp5 <- rast(paste0(sh, "future/ssp585/thetao_ssp585_depthsurf_dec50_mean.tif")) + +# Aggregate for faster plotting +current <- aggregate(current, 2) +ssp1 <- aggregate(ssp1, 2) +ssp5 <- aggregate(ssp5, 2) + +# Get deltas +ssp1_d <- ssp1 - current +ssp5_d <- ssp5 - current + +# Convert to data.frame +current_df <- as.data.frame(current, xy = T) +ssp1_d_df <- as.data.frame(ssp1_d, xy = T) +ssp5_d_df <- as.data.frame(ssp5_d, xy = T) + +# Change column names +colnames(ssp1_d_df)[3] <- colnames(ssp5_d_df)[3] <- "delta" +colnames(current_df)[3] <- "val" + +# Verify range for breaks +range(ssp1_d_df$delta) +range(ssp5_d_df$delta) +# See how many are above 2.5, as difference is small +length(ssp5_d_df$delta[ssp5_d_df$delta > 2.5]) + +# Add extremes on one small cell to overcome ggplot limits +ssp1_d_df[1,3] <- -1.5 +ssp1_d_df[2,3] <- 2.5 +ssp5_d_df[1,3] <- -1.5 +ssp5_d_df[2,3] <- 2.5 + +# Change high values of SSP5 (i.e 2.53) to be within the scale +ssp5_d_df$delta[ssp5_d_df$delta > 2.5] <- 2.5 + +# Load world shapefile +world <- rnaturalearth::ne_countries(scale = 110, returnclass = "sf") + +# Make SST plot +a <- ggplot() + + geom_sf(data = world, color = "white", fill = "white") + + geom_contour_filled(data = current_df, aes(x = x, y = y, z = val)#, + #breaks = c(-1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2)#, + #color = "grey80", linewidth = 0.2 + ) + + scale_fill_brewer(palette = "BuPu", direction = 1, + guide = guide_coloursteps(show.limits = T), + name = "SST (°C)") + + coord_sf(expand = F, label_axes = list(bottom = "", right = "")) + + ggtitle("Current period") + + theme_light() + + theme(plot.background = element_blank(), + #panel.border = element_blank(), + axis.title = element_blank(), + legend.key.height = unit(0.03, "npc"), + legend.key.width = unit(0.1, "npc"), + legend.position = "bottom", + legend.title.position = "top", + legend.justification = "center", + legend.title = element_text(hjust = 0.5), + panel.grid = element_blank());a + +# Make SSP1 plot +b <- ggplot() + + geom_sf(data = world, color = "white", fill = "white") + + geom_contour_filled(data = ssp1_d_df, aes(x = x, y = y, z = delta), + breaks = c(-1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 2.5)#, + #color = "grey80", linewidth = 0.2 + ) + + scale_fill_manual(values = rev(c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#F7F7F7", "#D1E5F0", "#92C5DE", "#4393C3")), + guide = guide_coloursteps(show.limits = T), + name = "Difference in temperature (°C)") + + coord_sf(expand = F, + label_axes = list(bottom = "", right = "")) + + ggtitle("SSP1 (2050)") + + theme_light() + + theme(plot.background = element_blank(), + #panel.border = element_blank(), + axis.title = element_blank(), + legend.key.height = unit(0.03, "npc"), + legend.key.width = unit(0.1, "npc"), + legend.position = "bottom", + legend.title.position = "top", + legend.justification = "center", + legend.title = element_text(hjust = 0.5), + panel.grid = element_blank());b + +# Make SSP5 plot +c <- ggplot() + + geom_sf(data = world, color = "white", fill = "white") + + geom_contour_filled(data = ssp5_d_df, aes(x = x, y = y, z = delta), + breaks = c(-1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 2.5)#, + #color = "grey80", linewidth = 0.2 + ) + + scale_fill_manual(values = rev(c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#F7F7F7", "#D1E5F0", "#92C5DE", "#4393C3")), + guide = guide_coloursteps(show.limits = T), + name = "Difference in temperature (°C)") + + coord_sf(expand = F, + label_axes = list(bottom = "", right = "")) + + ggtitle("SSP5 (2050)") + + theme_light() + + theme(plot.background = element_blank(), + #panel.border = element_blank(), + axis.title = element_blank(), + legend.key.height = unit(0.03, "npc"), + legend.key.width = unit(0.1, "npc"), + legend.position = "bottom", + legend.title.position = "top", + legend.justification = "center", + legend.title = element_text(hjust = 0.5), + panel.grid = element_blank());c + +# Compose +pc <- ((plot_spacer() + a + plot_layout(guides = "keep")) | (b + c + plot_layout(guides = "collect"))) & theme(legend.position = "bottom", + legend.key.height = unit(0.025, "npc"), + legend.key.width = unit(0.04, "npc"), + legend.title = element_text(hjust = 0.5, size = 12), + legend.text = element_text(hjust = 0.5, size = 12)) + +# Save +ggsave(filename = "figures/sst_isomaps.png", plot = pc, width = 28, height = 8) + +# END \ No newline at end of file diff --git a/codes/sdm_components_mhs.R b/codes/sdm_components_mhs.R new file mode 100644 index 0000000..09d689f --- /dev/null +++ b/codes/sdm_components_mhs.R @@ -0,0 +1,443 @@ +############################## MPA Europe project ############################## +########### WP3 - Species and biogenic habitat distributions (UNESCO) ########## +# March of 2024 +# Authors: Silas C. Principe, Pieter Provoost +# Contact: s.principe@unesco.org +# +################## Components for main modelling function ###################### + +# Loading main data ---- +.cm_load_species <- function(species, species_dataset) { + # Load species data + if (tools::file_ext(species_dataset) == "parquet") { + if (utils::file_test("-d", species_dataset)) { + ds <- pl$scan_parquet(file.path(species_dataset, "**/*")) + } else { + ds <- pl$scan_parquet(species_dataset) + } + species_data <- ds$filter(pl$col("taxonID") == species)$collect() + species_data <- species_data$to_data_frame() + } else if (tools::file_ext(species_dataset) %in% c("sqlite", "db")) { + + con <- DBI::dbConnect(RSQLite::SQLite(), species_dataset) + + species_name <- worrms::wm_id2name(species) + + sel_species <- paste0("'", species_name, "'", collapse = ", ") + + db_query <- glue::glue(" + select species, h3, records + from occurrence + where species in ({sel_species}) + ") + + db_res <- DBI::dbSendQuery(con, db_query) + species_data <- DBI::dbFetch(db_res) + + DBI::dbDisconnect(con) + + species_data_coord <- h3jsr::cell_to_point(species_data$h3) + species_data_coord <- sf::st_coordinates(species_data_coord) + colnames(species_data_coord) <- c("decimalLongitude", "decimalLatitude") + + species_data <- dplyr::bind_cols(species_data, species_data_coord) + species_data$data_type <- "fit_points" + species_data$dataset_sel <- NA + species_data$taxonID <- species + + require(polars) + fao_areas <- pl$scan_parquet("data/fao_areas.parquet") + + sp_sbase <- try(rfishbase::species(list(species_name))) + + if (!inherits(sp_sbase, "try-error")) { + if (nrow(sp_sbase) < 1) { + sp_sbase <- try(rfishbase::species(list(species_name), + server = "sealifebase")) + } + } + + if (!inherits(sp_sbase, "try-error") & nrow(sp_sbase) > 0) { + fao_areas_sp <- fao_areas$filter(pl$col("SpecCode") == sp_sbase$SpecCode[1])$ + collect() + fao_areas_sp <- fao_areas_sp$to_data_frame() + + fao_areas_sp <- fao_areas_sp %>% + filter(Status %in% c("endemic", "Endemic", "native", "Native", "introduced", "Introduced")) + + if (nrow(fao_areas_sp) > 0) { + fao_shp <- terra::vect("data/shapefiles/World_Fao_Zones.shp") + + fao_shp_sel <- fao_shp[fao_shp$zone %in% fao_areas_sp$AreaCode] + + if (length(fao_shp_sel) > 0) { + sf::sf_use_s2(FALSE) + fao_shp_sel <- suppressMessages( + suppressWarnings( + sf::st_buffer(sf::st_make_valid(sf::st_as_sf(fao_shp_sel)), 0.2) + ) + ) + + is_onarea <- terra::is.related(terra::vect(species_data[,c("decimalLongitude", "decimalLatitude")], geom = c("decimalLongitude", "decimalLatitude")), + terra::vect(fao_shp_sel), + "intersects") + is_onarea <- ifelse(is_onarea, 1, 0) + + species_data$fao_confirmed <- is_onarea + } else { + species_data$fao_confirmed <- NA + } + + } else { + warning("Impossible to retrieve area information from FishBase/SeaLifeBase or no native areas") + } + } else { + warning("Impossible to retrieve species information from FishBase/SeaLifeBase") + } + + if (!"fao_confirmed" %in% colnames(species_data)) { + species_data$fao_confirmed <- NA + } + + + } else if (any(grepl("key", list.files(species_dataset, recursive = T)))) { + lf <- list.files(species_dataset, recursive = T) + lf <- lf[grepl(paste0("key=", species), lf)] + if (length(lf) > 0) { + species_data <- arrow::read_parquet(paste0(species_dataset, "/", lf)) + } else { + species_data <- matrix(nrow = 0, ncol = 1) + } + } else { + cli::cli_abort("No folder or files were found in {.path {species_dataset}}") + } + + return(species_data) +} + +# Check coastal ---- +.cm_check_coastal <- function(species_data, env, coord_names, verbose) { + + if ("coastal" %in% names(env$hypothesis)) { + # Test if is within europe/coastal + europe_starea <- vect("data/shapefiles/mpa_europe_starea_v2.shp") + europe_starea <- ext(europe_starea) + + is_within <- is.related(vect(species_data[,coord_names], geom = coord_names), + europe_starea, "intersects") + if (!all(is_within)) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + env$layers <- subset(env$layers, "wavefetch", negate = T) + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else { + + wave <- rast("data/env/terrain/wavefetch.tif") + wave_pts <- extract(wave, species_data[,coord_names]) + + if (sum(is.na(wave_pts[,2])) > ceiling(nrow(species_data) * .05)) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else if ((nrow(species_data) - sum(is.na(wave_pts[,2]))) < 15) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else { + env$layers <- crop(env$layers, europe_starea) + } + } + } + + return(env) +} + +# Check ecoregions ---- +.cm_check_ecoregions <- function(ecoregions, fit_pts, eval_pts, + env, limit_by_depth, depth_buffer) { + + # Check which eco-regions are covered by the points + ecoreg_unique <- ecoregions$Realm[is.related(ecoregions, + vect(bind_rows(fit_pts, eval_pts), + geom = coord_names), + "intersects")] + + model_log$model_details$ecoregions <- ecoreg_unique + ecoreg_occ <- ecoregions[ecoregions$Realm %in% ecoreg_unique,] + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_occ_buff <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_occ), 0.2))) + ) + + adj_ecoreg <- ecoregions$Realm[is.related(ecoregions, ecoreg_occ_buff, + "intersects")] + + # Mask areas + ecoreg_sel <- ecoregions[ecoregions$Realm %in% unique( + c(ecoreg_unique, adj_ecoreg) + ),] + model_log$model_details$ecoregions_included <- unique(ecoreg_sel$Realm) + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_sel <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_sel), 0.5))) + ) + + # Limit to max extension based on distance to the points + max_ext <- ext(vect(rbind(fit_pts, eval_pts), geom = coord_names)) + max_ext <- ext(as.vector(max_ext) + c(-20, 20, -20, 20)) + ecoreg_sel <- crop(ecoreg_sel, max_ext) + + # Limit by depth if TRUE + if (limit_by_depth) { + if (verbose) cli::cli_alert_info("Limiting by depth") + # To decide if this step will be kept: + # Load bathymetry layer + bath <- rast("data/env/terrain/bathymetry_mean.tif") + bath <- mask(crop(bath, ecoreg_sel), ecoreg_sel) + + bath_pts <- terra::extract(bath, bind_rows(fit_pts, eval_pts)) + + bath_range <- range(bath_pts[,2]) + bath_range[1] <- bath_range[1] - depth_buffer + bath_range[2] <- ifelse((bath_range[2] + depth_buffer) > 0, + 0, bath_range[2] + depth_buffer) + + bath[bath < bath_range[1] | bath > bath_range[2]] <- NA + + if ("coastal" %in% names(env$hypothesis)) { + bath <- crop(bath, europe_starea) + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + env$layers <- mask(env$layers, env$layers$wavefetch) + } else { + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + } + + } else { + if ("coastal" %in% names(env$hypothesis)) { + ecoreg_sel <- crop(ecoreg_sel, europe_starea) + env$layers <- mask(env$layers, ecoreg_sel) + } else { + env$layers <- mask(env$layers, ecoreg_sel) + } + } + + return(env) +} + +# Prepare data object ---- +.cm_prepare_data_obj <- function(fit_pts, eval_pts, env, verbose = FALSE) { + # Assess SAC + fit_pts_sac <- try(obissdm::outqc_sac(fit_pts, + env_layers = subset(env$layers, env$hypothesis[[1]]), + plot_result = FALSE, + verbose = verbose)) + + if (inherits(fit_pts_sac, "try-error")) { + fit_pts_sac <- fit_pts + } + + # Make data object + quad_n <- ifelse(nrow(as.data.frame(env$layers, xy = T, na.rm = T)) < 50000, + round(nrow(as.data.frame(env$layers, xy = T, na.rm = T))), 50000) + + sp_data <- mp_prepare_data(fit_pts_sac, eval_data = eval_pts, + species_id = species_name, + env_layers = env$layers, + quad_number = quad_n, + verbose = verbose) + + block_grid <- get_block_grid(sp_data, env$layers, + sel_vars = env$hypothesis$basevars, + verbose = verbose) + + sp_data <- mp_prepare_blocks(sp_data, + method = "manual", + block_types = "spatial_grid", + n_iterate = 300, + retry_if_zero = TRUE, + manual_shp = block_grid, + verbose = verbose) + + return(sp_data) +} + + +# Assess bias (TODO) ---- +.cm_bias_assess <- function(sp_data, env) { + + # Add some way of assessing spatial bias + model_log$model_details$control_bias <- correct_bias + + # Get spatstat statistics and point process for control + spat_im <- as.im(as.data.frame(aggregate(env$layers[[1]], 10, na.rm = T), xy = T)) + spat_window <- as.owin(spat_im) + # Define ppp object based on point locations + spat_ppp <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 1], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 1], + window = spat_window) + # calculate envelope around L-hat estimates. + spat_env_k <- envelope(spat_ppp, Kest, verbose = F) + spat_env_l <- envelope(spat_ppp, Lest, verbose = F) + + # Point process model (?) + spat_quad <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 0], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 0], + window = spat_window) + spat_quad_sc <- quadscheme(data = spat_ppp, dummy = spat_quad, method = "dirichlet") + spat_data_obj <- lapply(1:nlyr(env$layers), function(x){ + as.im(as.data.frame(aggregate(env$layers[[x]], 4, na.rm = T), xy = T)) + }) + names(spat_data_obj) <- names(env$layers) + spat_ppm <- ppm(spat_quad_sc, + trend = as.formula(paste("~", + paste(names(spat_data_obj)[ + names(spat_data_obj) %in% env$hypothesis[[1]] + ], collapse = "+"))), + covariates = spat_data_obj) + + spat_pred <- predict(spat_ppm, covariates = spat_data_obj, ngrid = c(spat_data_obj[[1]]$dim[1], spat_data_obj[[1]]$dim[2])) + + if (correct_bias) { + bias_layer <- density(spat_ppp, sigma = 1) + bias_layer <- rast(bias_layer) + bias_layer <- disagg(bias_layer, 10) + crs(bias_layer) <- crs("EPSG:4326") + bias_layer <- project(bias_layer, env$layers[[1]]) + bias_layer <- mask(bias_layer, env$layers[[1]]) + } + + return(list()) + +} + +# Check good models ---- +.cm_check_good_models <- function(model_fits, tg_metrics = "cbi", tg_threshold = 0.3) { + + # Get what models are above threshold + good_models <- lapply(model_fits, function(model){ + if (!is.null(model)) { + cv_res <- model$cv_metrics + cv_res <- apply(cv_res, 2, mean, na.rm = T) + the_metric <- cv_res[[tg_metrics]] + if (!is.na(the_metric)) { + if (the_metric >= tg_threshold) { + if (sum(is.na(model$cv_metrics[[tg_metrics]])) >= 4) { + return(FALSE) + } else { + return(TRUE) + } + } else { + return(FALSE) + } + } else { + return(FALSE) + } + } else { + return(FALSE) + } + }) + + good_models <- which(unlist(good_models)) + + return(good_models) +} + +# Predict models ---- +.cm_predict_models <- function(good_models, model_fits, best_hyp, hab_depth, + outfolder, species, outacro, + verbose) { + + # Predict models + model_predictions <- lapply(good_models, function(id) { + model_name <- model_fits[[id]]$name + + #best_hyp <- multi_mod_max$best_model + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/") + + if (!dir.exists(outpath)) fs::dir_create(outpath) + + outmess <- paste0(outpath, "taxonid=", species, "_model=", outacro, "_what=mess.tif") + + if (!file.exists(gsub("mess", "mess_cog", outmess))) {do_mess <- TRUE} else {do_mess <- FALSE} + + scenarios <- data.frame( + scenario = c("current", rep(c("ssp116", "ssp245", "ssp370", "ssp460", "ssp585"), + each = 2)), + year = c(NA, rep(c("dec50", "dec100"), 5)) + ) + + for (k in 1:nrow(scenarios)) { + + if (is.na(scenarios$year[k])) { + period <- NULL + } else { + period <- scenarios$year[k] + } + + if (verbose) cli::cli_alert_info("Predicting scenario {k} of {nrow(scenarios)}.") + + env_to_pred <- obissdm::get_envofgroup(group, + depth = hab_depth, load_all = F, + scenario = scenarios$scenario[k], + period = period, + hypothesis = best_hyp, + env_folder = "data/env", + conf_file = "sdm_conf.yml", + verbose = verbose) + + pred <- predict(model_fits[[id]], env_to_pred) + + names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (do_mess) { + # Save MESS + if (!exists("to_mess")) { + maxpt <- nrow(as.data.frame(env_to_pred, xy = T, na.rm = T)) + to_mess <- sample(1:maxpt, + ifelse(maxpt > 10000, 10000, maxpt)) + } + mess_map <- ecospat::ecospat.mess( + na.omit(as.data.frame(env_to_pred, xy = T)[to_mess,]), + cbind(sp_data$coord_training, sp_data$training[,2:ncol(sp_data$training)])) + mess_map_t <- env_to_pred[[1]] + mess_map_t[] <- NA + mess_map_t[cellFromXY(mess_map_t, mess_map[,1:2])] <- mess_map[,5] + mess_map <- mask(mess_map_t, env_to_pred[[1]]) + + names(mess_map) <- names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (k == 1) { + pred_mess <- mess_map + } else { + pred_mess <- c(pred_mess, mess_map) + } + } + + + if (k == 1) { + pred_f <- pred + } else { + pred_f <- c(pred_f, pred) + } + } + + writeRaster(pred_f, paste0(outpath, "taxonid=", species, "_model=", outacro, "_method=", model_name,".tif"), + overwrite = T) + + if (do_mess) { + writeRaster(pred_mess, outmess, overwrite = T) + cogeo_optim(outmess) + } + + return(pred_f[["current"]]) + + }) + + return(model_predictions) +} + +.cm_get_respcurves <- function() { + +} \ No newline at end of file diff --git a/codes/sdm_model_fishes.R b/codes/sdm_model_fishes.R new file mode 100644 index 0000000..eb8a3f0 --- /dev/null +++ b/codes/sdm_model_fishes.R @@ -0,0 +1,180 @@ +############################## MPA Europe project ############################## +########### WP3 - Species and biogenic habitat distributions (UNESCO) ########## +# March of 2024 +# Authors: Silas C. Principe, Pieter Provoost +# Contact: s.principe@unesco.org +# +################## Run multiple species distribution models #################### + + +# Load packages ---- +library(obissdm) +library(furrr) +library(progressr) +library(storr) +library(polars) +library(arrow) +library(dplyr) +library(DBI) +library(terra) +source("../../obis/research/marineheritage_sst/functions/sdm_model_function.R") +set.seed(2023) +handlers("cli") +options("progressr.enable" = TRUE) +setwd("~/Research/mpa_europe/mpaeu_sdm") +fs::dir_create("../../obis/research/marineheritage_sst/results/sdms") + +# Define settings ---- + +# General +# The output folder +outfolder <- "../../obis/research/marineheritage_sst/results/sdms" +# An unique code for identifying this model run +outacro <- "mhs" +# Run in parallel? +run_parallel <- FALSE +# Number of cores for parallel processing +n_cores <- 4 + +# Modelling +# Algorithms to be used +algos <- c("maxent") +# Should areas be masked by the species depth? +limit_by_depth <- TRUE +# A buffer to be applied to the depth limitation +depth_buffer <- 500 +# Assess spatial bias? +assess_bias <- FALSE +# Try to correct spatial bias? +correct_bias <- FALSE + +# Create storr to hold results +st <- storr_rds(paste0(outacro, "_storr")) +# If does not exist, add start date +if (!st$exists("startdate")) { + st$set("startdate", format(Sys.Date(), "%Y%m%d")) +} +# Should the storr object be destructed at the end if the model succeeded? +destroy_storr <- FALSE + +# Get list of species +splist <- read_parquet("~/Research/obis/research/marineheritage_sst/results/species_list.parquet") + +fishes <- splist[splist$group == "fish", ] + +head(fishes) + +fishes$where <- apply(fishes[,3:5], 1, function(x){ + if (x[1] | x[2]) { + "databases" + } else if (!x[1] & !x[2]) { + "edna" + } +}) + +fishes$sdm_group <- "fishes" + + + +# Fit models ---- +# Create a function to save results in a storr object for better control +pmod <- function(sp, gp, sdat, outf, outac, alg, lmd, lmd_buf, assb, corb, p) { + + p() + + if (st$exists(sp)) { + if (st$get(as.character(sp))[[1]] %in% c("done", "succeeded")) { + to_do <- FALSE + } else { + to_do <- TRUE + } + } else { + to_do <- TRUE + } + + if (to_do) { + st$set(sp, "running") + + sp_name <- fishes$species[fishes$AphiaID == sp][1] + + sel_species <- paste0("'", sp_name, "'") + + con <- dbConnect(RSQLite::SQLite(), "~/Research/mpa_europe/mpaeu_shared/occurrence_h3_db.sqlite") + + query <- glue::glue(' + select h3, species, records + from occurrence + where species in ({sel_species}) + ') + + # Get results + res <- dbSendQuery(con, query) + records <- dbFetch(res) + + dbDisconnect(con) + + # Get occurrence coordinates + rec_coords <- h3jsr::cell_to_point(records$h3) + rec_coords <- sf::st_coordinates(rec_coords) + colnames(rec_coords) <- c("decimalLongitude", "decimalLatitude") + rec_coords <- as.data.frame(rec_coords) + rec_coords$species <- sp_name + rec_coords$data_type <- "fit_points" + + #print(head(rec_coords)) + + fit_result <- try(model_species(species = sp, + group = gp, + species_data = rec_coords, + outfolder = outf, + outacro = outac, + algorithms = alg, + limit_by_depth = lmd, + depth_buffer = lmd_buf, + assess_bias = assb, + correct_bias = corb, verbose = T), + silent = F) + + if (!inherits(fit_result, "try-error")) { + st$set(sp, fit_result) + } else { + st$set(sp, list(status = "failed", + error = fit_result)) + } + } + + return(invisible(NULL)) +} + +# Run models according to the strategy +if (run_parallel) { + plan(multisession, workers = n_cores) + + with_progress({ + p <- progressor(steps = nrow(species_list)) + result <- future_map2(species_list$taxonID, species_list$sdm_group, pmod, + outf = outfolder, + outac = outacro, alg = algos, lmd = limit_by_depth, + lmd_buf = depth_buffer, assb = assess_bias, + corb = correct_bias, + p = p, .options = furrr_options(seed = T)) + }) +} else { + with_progress({ + p <- progressor(steps = nrow(fishes[1:2,])) + result <- purrr::map2(fishes$AphiaID[1:2], fishes$sdm_group[1:2], pmod, + outf = outfolder, + outac = outacro, alg = algos, lmd = limit_by_depth, + lmd_buf = depth_buffer, assb = assess_bias, + corb = correct_bias, + p = p) + }) +} + + + +# Check results ---- +# Check if everything was processed +cli::cli_alert_warning("{.val {length(st$list())}} out of {.val {nrow(species_list)}} model{?s} processed.") + +# And if so, destroy storr object \ No newline at end of file diff --git a/codes/sdm_model_fishes_mhs.R b/codes/sdm_model_fishes_mhs.R new file mode 100644 index 0000000..bb672c8 --- /dev/null +++ b/codes/sdm_model_fishes_mhs.R @@ -0,0 +1,203 @@ +############################## MPA Europe project ############################## +########### WP3 - Species and biogenic habitat distributions (UNESCO) ########## +# March of 2024 +# Authors: Silas C. Principe, Pieter Provoost +# Contact: s.principe@unesco.org +# +################## Run multiple species distribution models #################### + + +# Load packages ---- +library(obissdm) +library(furrr) +library(progressr) +library(storr) +library(polars) +library(arrow) +library(dplyr) +library(terra) +source("codes/sdm_components_mhs.R") +source("codes/sdm_model_function_mhs.R") + +setwd("~/Research/mpa_europe/mpaeu_sdm/") +source("functions/auxiliary_modelfit.R") +set.seed(2023) +handlers("cli") +options("progressr.enable" = TRUE) + + +# Define settings ---- +# This file can also be run as "source". In that case, the default objects +# are used. To enable also a more flexible run, for testing purposes, three +# arguments can be override by creating the objects on the environment before. +# `Those are outfolder`, `outacro`, and `sel_species`. The latter have as +# default the value "all", what means that all species in the species list will +# be modeled. Supplying a vector of AphiaID will filter the species based on the +# supplied values. +# +# We create a small function to perform those checks +check_exists <- function(object_name, if_not) { + if (exists(object_name)) { + return(eval(parse(text = object_name))) + } else { + return(if_not) + } +} + +# General +# The output folder +outfolder <- check_exists("outfolder", "~/Research/obis/research/marineheritage_sst/results/sdms") +# An unique code for identifying this model run +outacro <- check_exists("outacro", "mhs") +# What will be modelled? +# 'all' will model all species. Supply a vector of AphiaIDs to filter +sel_species <- check_exists("sel_species", "all") +# The path for the species data dataset +species_dataset <- "~/Research/mpa_europe/mpaeu_shared/occurrence_h3_db.sqlite" +# The path for the species list +species_list <- "~/Research/obis/research/marineheritage_sst/results/fish_paf_species.csv"#"~/Research/obis/research/marineheritage_sst/results/fish_paf_sdmlist.csv" +# Run in parallel? For avoiding parallel, change both to FALSE +run_parallel <- ifelse(length(sel_species) > 1 | sel_species == "all", TRUE, FALSE)[1] +# Number of cores for parallel processing +n_cores <- 4 + +# Modelling +# Algorithms to be used +algos <- c("maxent") +# Should areas be masked by the species depth? +limit_by_depth <- TRUE +# A buffer to be applied to the depth limitation +depth_buffer <- 500 +# Assess spatial bias? +assess_bias <- FALSE +# Try to correct spatial bias? +correct_bias <- FALSE + +# Create storr to hold results +st <- storr_rds(paste0(outacro, "_storr")) +# If does not exist, add start date +if (!st$exists("startdate")) { + st$set("startdate", format(Sys.Date(), "%Y%m%d")) +} +# Should the storr object be destructed at the end if the model succeeded? +destroy_storr <- FALSE + + +# Define species ---- +species_list <- read.csv(species_list) +species_list$sdm_group <- "fishes" +species_list <- species_list %>% + distinct(species, .keep_all = T) +species_list <- species_list[order(species_list$records),] + +if (length(sel_species) > 1 | !any("all" %in% sel_species)) { + species_list <- species_list %>% + filter(AphiaID %in% sel_species) +} + + +# Output for control +cli::cat_rule() +cli::cat_line(cli::col_cyan("MPA EUROPE PROJECT - WP3 Species and biogenic habitat distributions")) +cli::cat_line("Run species distribution models for multiple species in parallel") +cli::cat_line() +cli::cli_inform("Running SDM for {nrow(species_list)} species") +cli::cli_inform("Chosen algorithm{?s}: {.val {algos}}") +if (!limit_by_depth) { + cli::cli_inform(c("x" = "Not limiting by depth")) +} else { + cli::cli_inform(c("v" = "Limiting by depth using {.val {depth_buffer}} buffer")) +} +cli::cat_line() +cli::cli_inform("Outputing to folder {.path {outfolder}} using acronym {.val {outacro}}") +if (dir.exists(outfolder)) { + cli::cli_alert_warning("Folder already exists, files may be overwritten!") +} else { + fs::dir_create(outfolder) +} +if (run_parallel) { + cli::cli_alert_warning("Running in parallel using {n_cores} cores") +} else { + cli::cli_alert_warning("Running in sequence") +} +cli::cat_rule() + + + + +# Fit models ---- +# Create a function to save results in a storr object for better control +pmod <- function(sp, gp, sdat, outf, outac, alg, lmd, lmd_buf, assb, corb, p) { + + p() + + if (st$exists(sp)) { + if (st$get(as.character(sp))[[1]] %in% c("done", "succeeded", "low_data", + "failed", "no_good_model")) { + to_do <- FALSE + } else { + to_do <- TRUE + } + } else { + to_do <- TRUE + } + + if (to_do) { + st$set(sp, "running") + + fit_result <- try(model_species(species = sp, + group = gp, + species_dataset = sdat, + outfolder = outf, + outacro = outac, + algorithms = alg, + limit_by_depth = lmd, + depth_buffer = lmd_buf, + assess_bias = assb, + correct_bias = corb, + verbose = FALSE), + silent = T) + + if (!inherits(fit_result, "try-error")) { + st$set(sp, fit_result) + } else { + st$set(sp, list(status = "failed", + error = fit_result)) + } + } + + return(invisible(NULL)) +} + +# Run models according to the strategy +if (run_parallel) { + plan(multisession, workers = n_cores) + + with_progress({ + p <- progressor(steps = nrow(species_list)) + result <- future_map2(species_list$AphiaID, species_list$sdm_group, pmod, + sdat = species_dataset, outf = outfolder, + outac = outacro, alg = algos, lmd = limit_by_depth, + lmd_buf = depth_buffer, assb = assess_bias, + corb = correct_bias, + p = p, .options = furrr_options(seed = T)) + }) +} else { + with_progress({ + p <- progressor(steps = nrow(species_list)) + result <- purrr::map2(species_list$AphiaID, species_list$sdm_group, pmod, + sdat = species_dataset, outf = outfolder, + outac = outacro, alg = algos, lmd = limit_by_depth, + lmd_buf = depth_buffer, assb = assess_bias, + corb = correct_bias, + p = p) + }) +} + + + +# Check results ---- +# Check if everything was processed +cli::cli_alert_warning("{.val {length(st$list())}} out of {.val {nrow(species_list)}} model{?s} processed.") + +# And if so, destroy storr object \ No newline at end of file diff --git a/codes/sdm_model_function_mhs.R b/codes/sdm_model_function_mhs.R new file mode 100644 index 0000000..60532be --- /dev/null +++ b/codes/sdm_model_function_mhs.R @@ -0,0 +1,941 @@ +############################## MPA Europe project ############################## +########### WP3 - Species and biogenic habitat distributions (UNESCO) ########## +# March of 2024 +# Authors: Silas C. Principe, Pieter Provoost +# Contact: s.principe@unesco.org +# +########################### Main modelling function ############################ + + +#' Model species distribution according to MPA Europe framework +#' +#' @param species AphiaID of the species +#' @param group group of the species, according to the configuration file +#' @param species_dataset the path for the Parquet file containing species +#' occurrences +#' @param outfolder the folder to output the results +#' @param outacro an unique acronym for the modelling +#' @param algorithms which algorithms to fit. See [obissdm::sdm_options()] for a +#' list of available algorithms +#' @param limit_by_depth should the study area be limited by depth? If so, the +#' depth is extracted from occurrence records and after applying the buffer +#' the layers are masked. Recommended to keep `TRUE` +#' @param depth_buffer the depth buffer to be added in case `limit_by_depth = +#' TRUE`. Can be 0 to apply no buffer (not recommended) +#' @param assess_bias if `TRUE`, perform tests for potential spatial bias on +#' occurrence records +#' @param correct_bias if `TRUE`, apply a bias layer to try to correct the +#' spatial bias +#' @param verbose if `TRUE` print essential messages. Can also be numeric: 0 for +#' no messages, 1 for progress messages and 2 for all messages. +#' +#' @return nothing, saved files +#' @export +#' +#' @details +#' This function is a wrapper around functions from the [obissdm] package, to +#' provide a consistent pipeline for generating the SDMs according to the +#' MPA Europe project. +#' +#' Output will be in the format: +#' {outfolder}/taxonid={AphiaID}/model={outacro}/predictions OR models OR metrics/ +#' +#' - predictions: contains all raster files, optimized for cloud (COG) +#' - models: contain model object in zip files +#' - metrics: contain relevant metrics +#' +#' A single `.json` file is saved on the root of the model containing details +#' of the model fit. Also a `.parquet` file is saved with the occurrence data +#' used to fit the model (i.e. already filtered and standardized) +#' +#' @examples +#' \dontrun{ +#' model_species(124316, "benthic_invertebrates", "sp_dataset.parquet", +#' "results", "mr1") +#' } +model_species <- function(species, + group, + species_dataset, + outfolder, + outacro, + algorithms = c("maxent", "rf", "brt", "lasso"), + limit_by_depth = TRUE, + depth_buffer = 500, + assess_bias = TRUE, + correct_bias = TRUE, + verbose = FALSE) { + + # Check verbosity + verb_1 <- verb_2 <- FALSE + if (is.numeric(verbose)) { + if (verbose == 1) { + verb_1 <- TRUE + } else if (verbose == 2) { + verb_1 <- verb_2 <- TRUE + } + } else { + if (verbose) { + verb_1 <- TRUE + } + } + + if (verb_1) cli::cli_alert_info("Starting model for species {species}") + + # Record timing + treg <- obissdm::.get_time() + + # Define global objects and fill log + coord_names <- c("decimalLongitude", "decimalLatitude") + model_log <- obissdm::gen_log(algorithms) + model_log$taxonID <- species + model_log$model_acro <- outacro + model_log$model_date <- Sys.Date() + + # Load species data + if (verbose) cli::cli_alert_info("Reading data") + species_data <- .cm_load_species(species, species_dataset) + + treg <- obissdm::.get_time(treg, "Species data loading") + + # If data is available, proceeds + if (nrow(species_data) >= 15) { + + if (verbose) cli::cli_alert_info("Enough number of points, proceeding") + # PART 1: DATA LOADING ---- + # Load species information + species_name <- species_data$species[1] + model_log$scientificName <- species_name + + # Load ecological information + eco_info <- arrow::open_csv_dataset("data/species_ecoinfo.csv") %>% + filter(taxonID == species) %>% + collect() + + if (nrow(eco_info) < 1) { + eco_info <- obissdm::mp_get_ecoinfo( + species_list = species, + outfile = NULL, + return_table = TRUE, + show_progress = FALSE + ) + } + + # Select ecological information for the species + hab <- eco_info$mode_life + hab_depth <- hab_to_depth(hab) + + # Load environmental data + env <- get_envofgroup(group, depth = hab_depth, load_all = T, + conf_file = "sdm_conf.yml", verbose = verbose) + + env <- .cm_check_coastal(species_data, env, coord_names, verbose) + + + # Load ecoregions shapefile + ecoregions <- vect("data/shapefiles/MarineRealms.shp") + + # Split the dataset + fit_pts <- split_ds(species_data) + eval_pts <- split_ds(species_data, "eval") + + treg <- obissdm::.get_time(treg, "Data loading") + + + + # PART 2: DATA PREPARING ---- + if (verbose) cli::cli_alert_info("Preparing data") + # Check which eco-regions are covered by the points + ecoreg_unique <- ecoregions$Realm[is.related(ecoregions, + vect(bind_rows(fit_pts, eval_pts), + geom = coord_names), + "intersects")] + + model_log$model_details$ecoregions <- ecoreg_unique + ecoreg_occ <- ecoregions[ecoregions$Realm %in% ecoreg_unique,] + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_occ_buff <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_occ), 0.2))) + ) + + adj_ecoreg <- ecoregions$Realm[is.related(ecoregions, ecoreg_occ_buff, + "intersects")] + + # Mask areas + ecoreg_sel <- ecoregions[ecoregions$Realm %in% unique( + c(ecoreg_unique, adj_ecoreg) + ),] + model_log$model_details$ecoregions_included <- unique(ecoreg_sel$Realm) + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_sel <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_sel), 0.5))) + ) + + # Limit to max extension based on distance to the points + max_ext <- ext(vect(rbind(fit_pts, eval_pts), geom = coord_names)) + max_ext <- ext(as.vector(max_ext) + c(-20, 20, -20, 20)) + ecoreg_sel <- crop(ecoreg_sel, max_ext) + + # Limit by depth if TRUE + if (limit_by_depth) { + if (verbose) cli::cli_alert_info("Limiting by depth") + # To decide if this step will be kept: + # Load bathymetry layer + bath <- rast("data/env/terrain/bathymetry_mean.tif") + bath <- mask(crop(bath, ecoreg_sel), ecoreg_sel) + + bath_pts <- terra::extract(bath, bind_rows(fit_pts, eval_pts)) + + bath_range <- range(bath_pts[,2]) + bath_range[1] <- bath_range[1] - depth_buffer + bath_range[2] <- ifelse((bath_range[2] + depth_buffer) > 0, + 0, bath_range[2] + depth_buffer) + + bath[bath < bath_range[1] | bath > bath_range[2]] <- NA + + if ("coastal" %in% names(env$hypothesis)) { + bath <- crop(bath, europe_starea) + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + env$layers <- mask(env$layers, env$layers$wavefetch) + } else { + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + } + + model_log$model_details$limited_by_depth <- TRUE + model_log$model_details$depth_buffer <- depth_buffer + + } else { + if ("coastal" %in% names(env$hypothesis)) { + ecoreg_sel <- crop(ecoreg_sel, europe_starea) + env$layers <- mask(env$layers, ecoreg_sel) + } else { + env$layers <- mask(env$layers, ecoreg_sel) + } + } + + # Assess SAC + fit_pts_sac <- try(obissdm::outqc_sac(fit_pts, + env_layers = subset(env$layers, env$hypothesis[[1]]), + plot_result = FALSE, + verbose = verbose)) + + if (inherits(fit_pts_sac, "try-error")) { + fit_pts_sac <- fit_pts + } + + # Make data object + quad_n <- ifelse(nrow(as.data.frame(env$layers, xy = T, na.rm = T)) < 50000, + round(nrow(as.data.frame(env$layers, xy = T, na.rm = T))), 50000) + + sp_data <- mp_prepare_data(fit_pts_sac, eval_data = eval_pts, + species_id = species_name, + env_layers = env$layers, + quad_number = quad_n, + verbose = verbose) + + block_grid <- get_block_grid(sp_data, env$layers, + sel_vars = env$hypothesis$basevars, + verbose = verbose) + + sp_data <- mp_prepare_blocks(sp_data, + method = "manual", + block_types = "spatial_grid", + n_iterate = 300, + retry_if_zero = TRUE, + manual_shp = block_grid, + verbose = verbose) + + model_log$model_details$block_size <- unname(sp_data$blocks$grid_resolution) + model_log$model_fit_points <- sum(sp_data$training$presence) + model_log$model_eval_points <- sum(sp_data$eval_data$presence) + + treg <- obissdm::.get_time(treg, "Data preparing") + + # PART 3: ASSESS SPATIAL BIAS ---- + if (assess_bias) { + if (verbose) cli::cli_alert_info("Assessing spatial bias") + # Add some way of assessing spatial bias + model_log$model_details$control_bias <- correct_bias + + # Get spatstat statistics and point process for control + spat_im <- as.im(as.data.frame(aggregate(env$layers[[1]], 10, na.rm = T), xy = T)) + spat_window <- as.owin(spat_im) + # Define ppp object based on point locations + spat_ppp <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 1], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 1], + window = spat_window) + # calculate envelope around L-hat estimates. + spat_env_k <- envelope(spat_ppp, Kest, verbose = F) + spat_env_l <- envelope(spat_ppp, Lest, verbose = F) + + # Point process model (?) + spat_quad <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 0], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 0], + window = spat_window) + spat_quad_sc <- quadscheme(data = spat_ppp, dummy = spat_quad, method = "dirichlet") + spat_data_obj <- lapply(1:nlyr(env$layers), function(x){ + as.im(as.data.frame(aggregate(env$layers[[x]], 4, na.rm = T), xy = T)) + }) + names(spat_data_obj) <- names(env$layers) + spat_ppm <- ppm(spat_quad_sc, + trend = as.formula(paste("~", + paste(names(spat_data_obj)[ + names(spat_data_obj) %in% env$hypothesis[[1]] + ], collapse = "+"))), + covariates = spat_data_obj) + + spat_pred <- predict(spat_ppm, covariates = spat_data_obj, ngrid = c(spat_data_obj[[1]]$dim[1], spat_data_obj[[1]]$dim[2])) + + if (correct_bias) { + bias_layer <- density(spat_ppp, sigma = 1) + bias_layer <- rast(bias_layer) + bias_layer <- disagg(bias_layer, 10) + crs(bias_layer) <- crs("EPSG:4326") + bias_layer <- project(bias_layer, env$layers[[1]]) + bias_layer <- mask(bias_layer, env$layers[[1]]) + } + + treg <- obissdm::.get_time(treg, "Sample bias assessment") + } + + + # PART 4: MODEL SELECTION ---- + if (verbose) cli::cli_alert_info("Model selection") + multi_mod_max <- sdm_multhypo(sdm_data = sp_data, sdm_method = "maxent", + variables = env$hypothesis, + verbose = verbose) + + if (correct_bias) { + if (verbose) cli::cli_alert_info("Testing bias correction") + temp_data <- sp_data + temp_data$training <- temp_data$training[, c("presence", multi_mod_max$best_variables)] + temp_data$eval_data <- temp_data$eval_data[, c("presence", multi_mod_max$best_variables)] + + temp_data$training$biasgrid <- extract(bias_layer, temp_data$coord_training, ID = F)[,1] + temp_data$eval_data$biasgrid <- extract(bias_layer, temp_data$coord_eval, ID = F)[,1] + + not_na <- !is.na(temp_data$training$biasgrid) + not_na_eval <- !is.na(temp_data$eval_data$biasgrid) + + temp_data$training <- temp_data$training[not_na,] + temp_data$coord_training <- temp_data$coord_training[not_na,] + + temp_data$eval_data <- temp_data$eval_data[not_na_eval,] + temp_data$coord_eval <- temp_data$coord_eval[not_na_eval,] + + max_bias <- sdm_fit(temp_data) + + bias_metrics <- apply(max_bias$cv_metrics, 2, mean, na.rm = T) + model_metrics <- apply(multi_mod_max$model$cv_metrics, 2, mean, na.rm = T) + + if (bias_metrics[["cbi"]] > (model_metrics[["cbi"]] + 0.1)) { + # TODO: continue here + # TODO: bias layer needs to be other, not from the single species + # group or something - to see + } + } + + # Prepare data for the best model + sp_data$training <- sp_data$training[, c("presence", multi_mod_max$best_variables)] + sp_data$eval_data <- sp_data$eval_data[, c("presence", multi_mod_max$best_variables)] + + treg <- obissdm::.get_time(treg, "Model selection") + + + # PART 4: FIT MODELS ---- + if (verbose) cli::cli_alert_info("Starting model fitting") + model_fits <- list() + for (i in 1:length(algorithms)) { + if (verbose) cli::cli_alert_info("Fitting {algorithms[i]}") + model_fits[[i]] <- try(do.call(paste0("sdm_module_", algorithms[i]), list(sp_data, verbose = verbose))) + if (inherits(model_fits[[i]], "try-error")) { + model_log$model_result[[algorithms[i]]] <- "failed" + model_fits[[i]] <- NULL + } else { + model_log$model_result[[algorithms[i]]] <- "succeeded" + } + } + + # For some reason was not registering in the log... using for instead + # model_fits <- lapply(algorithms, function(algo) { + # fit <- try(do.call(paste0("sdm_module_", algo), list(sp_data))) + # if (inherits(fit, "try-error")) { + # model_log$model_result[[algo]] <- "failed" + # return(NULL) + # } else { + # model_log$model_result[[algo]] <- "succeeded" + # return(fit) + # } + # }) + + treg <- obissdm::.get_time(treg, "Model fit") + + + if (any(unlist(model_log$model_result) == "succeeded")) { + if (verbose) cli::cli_alert_info("Model fitting concluded. Checking if models are good") + # PART 5: PREDICTION ---- + + # Define thresholds + tg_metrics <- "cbi" + tg_threshold <- 0.3 + + # Get what models are above threshold + good_models <- lapply(model_fits, function(model){ + if (!is.null(model)) { + cv_res <- model$cv_metrics + cv_res <- apply(cv_res, 2, mean, na.rm = T) + the_metric <- cv_res[[tg_metrics]] + if (!is.na(the_metric)) { + if (the_metric >= tg_threshold) { + if (sum(is.na(model$cv_metrics[[tg_metrics]])) >= 4) { + return(FALSE) + } else { + return(TRUE) + } + } else { + return(FALSE) + } + } else { + return(FALSE) + } + } else { + return(FALSE) + } + }) + + good_models <- which(unlist(good_models)) + + if (length(good_models) > 0) { + if (verbose) cli::cli_alert_info("Good models available, predicting.") + # Predict models + model_predictions <- lapply(good_models, function(id) { + model_name <- model_fits[[id]]$name + + best_hyp <- multi_mod_max$best_model + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/") + + if (!dir.exists(outpath)) fs::dir_create(outpath) + + outmess <- paste0(outpath, "taxonid=", species, "_model=", outacro, "_what=mess.tif") + + if (!file.exists(gsub("mess", "mess_cog", outmess))) {do_mess <- TRUE} else {do_mess <- FALSE} + + scenarios <- data.frame( + scenario = c("current", rep(c("ssp126", "ssp245", "ssp370", "ssp585"), + each = 1)), + year = c(NA, rep(c("dec50"), 4)) + ) + + for (k in 1:nrow(scenarios)) { + + if (is.na(scenarios$year[k])) { + period <- NULL + } else { + period <- scenarios$year[k] + } + + if (verbose) cli::cli_alert_info("Predicting scenario {k} of {nrow(scenarios)}.") + + env_to_pred <- obissdm::get_envofgroup(group, + depth = hab_depth, load_all = F, + scenario = scenarios$scenario[k], + period = period, + hypothesis = best_hyp, + env_folder = "data/env", + conf_file = "sdm_conf.yml", + verbose = verbose) + + pred <- predict(model_fits[[id]], env_to_pred) + + names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (do_mess) { + # Save MESS + if (!exists("to_mess")) { + maxpt <- nrow(as.data.frame(env_to_pred, xy = T, na.rm = T)) + to_mess <- sample(1:maxpt, + ifelse(maxpt > 10000, 10000, maxpt)) + } + mess_map <- ecospat::ecospat.mess( + na.omit(as.data.frame(env_to_pred, xy = T)[to_mess,]), + cbind(sp_data$coord_training, sp_data$training[,2:ncol(sp_data$training)])) + mess_map_t <- env_to_pred[[1]] + mess_map_t[] <- NA + mess_map_t[cellFromXY(mess_map_t, mess_map[,1:2])] <- mess_map[,5] + mess_map <- mask(mess_map_t, env_to_pred[[1]]) + + names(mess_map) <- names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (k == 1) { + pred_mess <- mess_map + } else { + pred_mess <- c(pred_mess, mess_map) + } + } + + + if (k == 1) { + pred_f <- pred + } else { + pred_f <- c(pred_f, pred) + } + } + + writeRaster(pred_f, paste0(outpath, "taxonid=", species, "_model=", outacro, "_method=", model_name,".tif"), + overwrite = T) + + if (do_mess) { + writeRaster(pred_mess, outmess, overwrite = T) + cogeo_optim(outmess) + } + + return(pred_f[["current"]]) + + }) + + treg <- obissdm::.get_time(treg, "Model prediction") + + + + # PART X: GET RESPONSE CURVES ---- + if (verbose) cli::cli_alert_info("Getting response curves") + # Predict models + model_respcurves <- lapply(good_models, function(id) { + + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=respcurves.parquet") + + rcurves <- resp_curves(model_fits[[id]], + subset(env$layers, multi_mod_max$best_variables)) + + write_parquet(rcurves, outfile) + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/figures/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=responsecurves.png") + + p <- plot(rcurves) + ggtitle(paste("AphiaID:", species, "| Model:", outacro, "| Algo:", model_name)) + ggplot2::ggsave(filename = outfile, plot = p, + width = 10, height = 8) + + return(invisible(NULL)) + + }) + + treg <- obissdm::.get_time(treg, "Response curves") + + + + # PART X: ENSEMBLE ---- + if (length(good_models) > 1) { + if (verbose) cli::cli_alert_info("Enough number of models, ensembling") + + to_ensemble <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/"), + pattern = "\\.tif$", full.names = T) + to_ensemble <- to_ensemble[grepl(paste0(substr(algorithms, 1, 3)[good_models], collapse = "|"), to_ensemble)] + to_ensemble <- lapply(to_ensemble, rast) + lays <- nlyr(to_ensemble[[1]]) + + ensemble <- lapply(1:lays, function(l){ + pull_l <- lapply(to_ensemble, function(r){ + r[[l]] + }) + pull_l <- rast(pull_l) + ensemble_models("median", pull_l) + }) + + ensemble_cv <- rast(lapply(ensemble, function(x) x[[2]])) + ensemble <- rast(lapply(ensemble, function(x) x[[1]])) + + names(ensemble) <- names(ensemble_cv) <- names(to_ensemble[[1]]) + names(ensemble_cv) <- paste0(names(ensemble_cv), "_sd") + + outens <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/", + "taxonid=", species, "_model=", outacro, "_method=ensemble.tif") + + writeRaster(c(ensemble, ensemble_cv), outens, + #wopt=list(verbose=TRUE), + overwrite = T) + + model_predictions <- c(model_predictions, ensemble[[1]]) + + res_ens <- cogeo_optim(outens) + + # Evaluate ensemble + ensemble_curr <- ensemble[[1]] + + fit_ens <- extract(ensemble_curr, sp_data$coord_training, ID = F) + + fit_ens_eval <- obissdm::eval_metrics(sp_data$training$presence, fit_ens[,1]) + + if (!is.null(sp_data$eval_data)) { + eval_ens <- extract(ensemble_curr, sp_data$coord_eval, ID = F) + + eval_ens_eval <- obissdm::eval_metrics(sp_data$eval_data$presence, eval_ens[,1]) + + ensemble_metrics <- rbind(cbind(as.data.frame(t(fit_ens_eval)), origin = "fit_data"), + cbind(as.data.frame(t(eval_ens_eval)), origin = "eval_data")) + } else { + ensemble_metrics <- cbind(as.data.frame(t(fit_ens_eval)), origin = "fit_data") + } + + # Get average of models + avg_fit_metrics <- as.data.frame(t(apply(do.call("rbind", lapply(good_models, function(id){ + t(apply(model_fits[[id]]$cv_metrics, 2, mean, na.rm = T)) + })), 2, mean, na.rm = T))) + avg_fit_metrics$origin <- "avg_fit" + + if (!is.null(sp_data$eval_data)) { + avg_eval_metrics <- as.data.frame(t(apply(do.call("rbind", lapply(good_models, function(id){ + t(model_fits[[id]]$eval_metrics) + })), 2, mean, na.rm = T))) + avg_eval_metrics$origin <- "avg_eval" + avg_fit_metrics <- rbind(avg_fit_metrics, avg_eval_metrics) + } + + ensemble_metrics <- rbind(ensemble_metrics, avg_fit_metrics) + + # TODO: add SD? + + write_parquet(ensemble_metrics, + paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/", + "taxonid=", species, "_model=", outacro, + "_method=ensemble_what=metrics.parquet")) + } + + # Ensemble response curves + if (length(good_models) > 1) { + to_ensemble <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/"), + pattern = "what=resp", full.names = T) + to_ensemble <- to_ensemble[grepl(paste0(substr(algorithms, 1, 3)[good_models], collapse = "|"), to_ensemble)] + to_ensemble <- lapply(to_ensemble, read_parquet) + + init_data <- to_ensemble[[1]] + + for (k in 2:length(to_ensemble)) { + tdf <- data.frame(to_ensemble[[2]][,c("response")]) + colnames(tdf) <- paste0("response_", k) + init_data <- cbind(init_data, tdf) + } + + ens_resp <- apply(init_data[,startsWith(colnames(init_data), "resp")], 1, "median") + + ens_resp_curves <- cbind(init_data[,c("variable", "base")], response = ens_resp) + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=ensemble", + "_what=respcurves.parquet") + + write_parquet(ens_resp_curves, outfile) + + } + + + + # PART X: SAVE BINARIZATION INFO ---- + if (verbose) cli::cli_alert_info("Saving binarization info") + # Get thresholds + # P10 threshold + thresh_p10_mtp <- lapply(model_predictions, function(pred) { + predv <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F)[,1] + p10 <- ceiling(length(predv) * 0.9) + data.frame(p10 = rev(sort(predv))[p10], mtp = min(na.omit(predv))) + }) + + # Max Specificity + Sensitivity + thresh_maxss <- lapply(model_predictions, function(pred) { + pred_p <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F)[,1] + pred_a <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 0,], ID = F)[,1] + + pred_eval <- predicts::pa_evaluate(p = pred_p, a = pred_a) + + pred_thresh <- predicts::threshold(pred_eval) + + pred_thresh + }) + + if (length(model_predictions) > length(good_models)) { + names(thresh_p10_mtp) <- names(thresh_maxss) <- c(algorithms[good_models], "ensemble") + } else { + names(thresh_p10_mtp) <- names(thresh_maxss) <- algorithms[good_models] + } + + thresh_p10_mtp <- bind_rows(thresh_p10_mtp, .id = "model") + thresh_maxss <- bind_rows(thresh_maxss, .id = "model") + + thresh <- left_join(thresh_p10_mtp, thresh_maxss, by = "model") + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + write_parquet(thresh, paste0(outpath, + "taxonid=", species, "_model=", outacro, + "_what=thresholds.parquet")) + + + + # PART X: OPTIMIZE GEOTIFF (COG FORMAT) ---- + if (verbose) cli::cli_alert_info("Optimizing rasters (COG)") + + to_optimize <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions"), full.names = T) + to_optimize <- to_optimize[!grepl("aux", to_optimize)] + to_optimize <- to_optimize[!grepl("cog", to_optimize)] + + optimize_result <- lapply(to_optimize, cogeo_optim) + optimize_result <- do.call("rbind", optimize_result) + + + + # PART X: CREATE MASKS AND SAVE ---- + if (verbose) cli::cli_alert_info("Saving masks") + + # Get base raster + base_layer <- rast("data/env/terrain/bathymetry_mean.tif") + base_layer[!is.na(base_layer)] <- 1 + + # Get mask for study area species occurrence + ecoreg_occ_mask <- mask(base_layer, ecoreg_occ) + + # Get mask for study area ecoregions + ecoreg_mask <- mask(base_layer, ecoreg_sel) + + # Get area used for fitting + fit_mask <- terra::extend(env$layers[[1]], model_predictions[[1]]) + fit_mask[!is.na(fit_mask)] <- 1 + + masks <- c(ecoreg_occ_mask, ecoreg_mask, fit_mask) + names(masks) <- c("native_ecoregions", "fit_ecoregions", "fit_region") + + outmask <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/", + "taxonid=", species, "_model=", outacro, "_mask.tif") + terra::writeRaster(masks, outmask, overwrite = T) + mask_opt <- cogeo_optim(outmask) + if (file.exists(paste0(outmask, ".aux.json"))) fs::file_delete(paste0(outmask, ".aux.json")) + + treg <- obissdm::.get_time(treg, "File optimization") + + + + # PART X: EVALUATE MODELS USING OTHER TECHNIQUES ---- + if (verbose) cli::cli_alert_info("Performing post-evaluation") + + # TEST 1: ENVIRONMENTAL SPACE + # Make PCA + # pca_env <- princomp(scale(subset(env$layers, multi_mod_max$best_variables)), + # maxcell = (ncell(env$layers[[1]]) * .5)) + # pca_env_pred <- predict(scale(subset(env$layers, multi_mod_max$best_variables)), pca_env, index = 1:2) + # + # # Extract at data points + # pca_fit <- terra::extract(pca_env_pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F) + # + # # Extract at SAMPLED data points + # pca_pred <- lapply(model_predictions, function(pred){ + # pred <- mask(pred, masks$fit_region) + # + # samp_pred <- spatSample(pred, size = sum(sp_data$training$presence), method = "weights", values = F, xy = T, na.rm = T) + # + # terra::extract(pca_env_pred, samp_pred, ID = F) + # }) + # + # plot(NULL, xlim = minmax(pca_env_pred)[,1], ylim = minmax(pca_env_pred)[,2]) + # + # points(pca_fit, pch = 20, cex = .5, col = "blue") + # points(pca_pred[[1]], pch = 20, cex = .5, col = "red") + # + # chul1 <- chull(pca_fit) + # chul1 <- c(chul1, chul1[1]) + # chul1 <- vect(as.matrix(pca_fit[chul1,]), type = "polygons") + # + # chul2 <- chull(pca_pred[[1]]) + # chul2 <- c(chul2, chul2[1]) + # chul2 <- vect(as.matrix(pca_pred[[1]][chul2,]), type = "polygons") + # + # lines(chul1, col = "blue") + # lines(chul2, col = "red") + + + # Add Hypervolume option, # TODO + + # TEST 2 - temperature kernel + # Load temperature information + temp_layer <- list.files("data/env/", recursive = T, + pattern = "thetao_baseline", + full.names = T) + temp_layer <- temp_layer[grepl(hab_depth, temp_layer)] + temp_layer <- temp_layer[grepl("mean\\.tif$", temp_layer)] + temp_layer <- rast(temp_layer) + + if (ext(temp_layer) != ext(model_predictions[[1]])) { + temp_layer <- crop(temp_layer, model_predictions[[1]]) + } + + data_fit <- extract(temp_layer, + dplyr::bind_rows(sp_data$coord_training, + sp_data$coord_eval), ID = F) + + kd <- ks::kde(data_fit[,1], h = 8) + percentiles <- ks::qkde(c(0.01, 0.99), kd) + + masked <- temp_layer + masked[temp_layer >= percentiles[1] & temp_layer <= percentiles[2]] <- 3 + masked[temp_layer < percentiles[1] | temp_layer > percentiles[2]] <- 0 + + binary_maps <- lapply(1:length(model_predictions), function(x){ + pred <- model_predictions[[x]] + pred[pred < thresh_p10_mtp[x,]$p10] <- 0 + pred[pred > 0] <- 1 + pred + }) + + plot(masked) + plot(binary_maps[[1]]) + + diff_maps <- lapply(binary_maps, function(x){ + x + masked + }) + + diff_perc <- lapply(diff_maps, function(x){ + x[x == 0] <- NA + x[x == 3] <- NA + result <- as.data.frame(x) + result <- as.data.frame(table(result[,1])) + colnames(result) <- c("status", "frequency") + result$status <- ifelse(result$status == 1, "outside_tenv", "inside_tenv") + result$percentage <- round((result$frequency * 100) / sum(result$frequency), 2) + result + }) + + trange_maps <- lapply(binary_maps, function(x){ + x[x != 1] <- NA + x_pts <- as.data.frame(x, xy = T)[,1:2] + quantile(extract(temp_layer, x_pts, ID = F)[,1], + c(0.01, 0.05, 0.5, 0.95, 0.99), na.rm = T) + }) + + for (gm in 1:length(good_models)) { + salg <- algorithms[good_models[gm]] + model_log$model_posteval[[salg]] <- list( + thermal_range = quantile(data_fit[,1], c(0.01, 0.05, 0.5, 0.95, 0.99)), + thermal_range_binary = trange_maps[[gm]], + thermal_envelope = diff_perc[[gm]] + ) + } + + # jacc <- function(raster1.bin, raster2.bin) { + # combination <- sum(raster1.bin, raster2.bin, na.rm= T) + # intersection <- combination == 2 + # + # # Union is all the area covered by the both rasters + # union <- combination >= 1 + # + # union <- as.data.frame(union) + # inter <- as.data.frame(intersection) + # + # length(inter[inter[,1],]) / length(union[union[,1],]) + # } + # + # overlap_metric <- jacc(masked, crop(binary_maps[[1]], masked)) + # overlap_metric_b <- dismo::nicheOverlap(raster::raster(masked), raster::raster(crop(binary_maps[[1]], masked))) + # + # + # Test 4: niche overlap ecospat + + + + # PART X: SAVE MODELS OBJECTS ---- + if (verbose) cli::cli_alert_info("Saving models objects") + + # Save metrics + save_metrics <- lapply(good_models, function(id){ + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name) + + write_parquet(model_fits[[id]]$cv_metrics, paste0(outfile, "_what=cvmetrics.parquet")) + + other_metrics <- list(model_fits[[id]]$full_metrics, + NULL)#model_fits[[id]]$eval_metrics) + names(other_metrics) <- c("full", "eval") + other_metrics <- dplyr::bind_rows(other_metrics, .id = "what") + + write_parquet(other_metrics, paste0(outfile, "_what=fullmetrics.parquet")) + }) + + # Update log with best parameters + for (al in 1:length(algorithms)) { + if (!is.null(model_fits[[al]])) { + model_log$model_bestparams[[al]] <- model_fits[[al]]$parameters + } + } + + # Save models + save_models <- lapply(good_models, function(id) { + + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/models/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=model.rds") + + saveRDS(model_fits[[id]], file = outfile) + + return(invisible(NULL)) + + }) + + # Save fit points + write_parquet(sp_data$coord_training[sp_data$training$presence == 1,], + paste0(outfolder, "/taxonid=", species, "/model=", outacro, + "/taxonid=", species, "_model=", outacro, "_", + "what=fitocc.parquet")) + + + # Save log and return object + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_what=log.json") + save_log(model_log, outfile) + + cli::cli_alert_success("Model concluded for species {species}.") + + st_status <- "succeeded" + } else { + if (verbose) cli::cli_alert_warning("No good model available") + st_status <- "no_good_model" + } + + } else { + if (verbose) cli::cli_alert_warning("All models failed") + st_status <- "all_failed" + } + + } else { + if (nrow(species_data) < 15) { + if (verbose) cli::cli_alert_warning("Low number of points, failed") + # If data is low, annotate storr + st_status <- "low_data" + } else { + if (verbose) cli::cli_alert_warning("No data available, failed") + # If data is not available, annotate storr + st_status <- "no_data" + } + } + + # Return nothing, results are saved + return(st_status) + +} diff --git a/functions/sdm_model_function.R b/functions/sdm_model_function.R new file mode 100644 index 0000000..7f09693 --- /dev/null +++ b/functions/sdm_model_function.R @@ -0,0 +1,970 @@ +############################## MPA Europe project ############################## +########### WP3 - Species and biogenic habitat distributions (UNESCO) ########## +# March of 2024 +# Authors: Silas C. Principe, Pieter Provoost +# Contact: s.principe@unesco.org +# +########################### Main modelling function ############################ + + +#' Model species distribution according to MPA Europe framework +#' +#' @param species AphiaID of the species +#' @param group group of the species, according to the configuration file +#' @param species_data dataframe with species +#' occurrences +#' @param outfolder the folder to output the results +#' @param outacro an unique acronym for the modelling +#' @param algorithms which algorithms to fit. See [obissdm::sdm_options()] for a +#' list of available algorithms +#' @param limit_by_depth should the study area be limited by depth? If so, the +#' depth is extracted from occurrence records and after applying the buffer +#' the layers are masked. Recommended to keep `TRUE` +#' @param depth_buffer the depth buffer to be added in case `limit_by_depth = +#' TRUE`. Can be 0 to apply no buffer (not recommended) +#' @param assess_bias if `TRUE`, perform tests for potential spatial bias on +#' occurrence records +#' @param correct_bias if `TRUE`, apply a bias layer to try to correct the +#' spatial bias +#' @param verbose if `TRUE` print messages +#' +#' @return nothing, saved files +#' @export +#' +#' @details +#' This function is a wrapper around functions from the [obissdm] package, to +#' provide a consistent pipeline for generating the SDMs according to the +#' MPA Europe project. +#' +#' Output will be in the format: +#' {outfolder}/taxonid={AphiaID}/model={outacro}/predictions OR models OR metrics/ +#' +#' - predictions: contains all raster files, optimized for cloud (COG) +#' - models: contain model object in zip files +#' - metrics: contain relevant metrics +#' +#' A single `.json` file is saved on the root of the model containing details +#' of the model fit. +#' +#' @examples +#' \dontrun{ +#' model_species(124316, "benthic_invertebrates", "sp_dataset.parquet", +#' "results", "mr1") +#' } +model_species <- function(species, + group, + species_data, + outfolder, + outacro, + algorithms = c("maxent", "rf", "brt", "lasso"), + limit_by_depth = TRUE, + depth_buffer = 500, + assess_bias = TRUE, + correct_bias = TRUE, + verbose = FALSE) { + + if (verbose) cli::cli_alert_info("Starting model for species {species}") + + # Record timing + treg <- obissdm::.get_time() + + # Define global objects and fill log + coord_names <- c("decimalLongitude", "decimalLatitude") + model_log <- obissdm::gen_log(algorithms) + model_log$taxonID <- species + model_log$model_acro <- outacro + model_log$model_date <- Sys.Date() + + treg <- obissdm::.get_time(treg, "Species data loading") + + # If data is available, proceeds + if (nrow(species_data) >= 15) { + + if (verbose) cli::cli_alert_info("Enough number of points, proceeding") + # PART 1: DATA LOADING ---- + # Load species information + species_name <- species_data$species[1] + model_log$scientificName <- species_name + + # Load ecological information + eco_info <- obissdm::mp_get_ecoinfo( + species_list = species, + outfile = NULL, + return_table = TRUE, + show_progress = FALSE + ) + + # Select ecological information for the species + hab <- eco_info$mode_life + hab_depth <- hab_to_depth(hab) + + # Load environmental data + env <- get_envofgroup(group, depth = hab_depth, load_all = T, + conf_file = "sdm_conf.yml", verbose = verbose) + + if ("coastal" %in% names(env$hypothesis)) { + # Test if is within europe/coastal + europe_starea <- vect("data/shapefiles/mpa_europe_starea_v2.shp") + europe_starea <- ext(europe_starea) + + is_within <- is.related(vect(species_data[,coord_names], geom = coord_names), + europe_starea, "intersects") + if (!all(is_within)) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + env$layers <- subset(env$layers, "wavefetch", negate = T) + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else { + + wave <- rast("data/env/terrain/wavefetch.tif") + wave_pts <- extract(wave, species_data[,coord_names]) + + if (sum(is.na(wave_pts[,2])) > ceiling(nrow(species_data) * .05)) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else if ((nrow(species_data) - sum(is.na(wave_pts[,2]))) < 15) { + env$hypothesis <- env$hypothesis[names(env$hypothesis) != "coastal"] + if (verbose) cli::cli_alert_warning("Coastal hypothesis found but discarded") + } else { + env$layers <- crop(env$layers, europe_starea) + } + } + } + + + # Load ecoregions shapefile + ecoregions <- vect("data/shapefiles/MarineRealms.shp") + + # Split the dataset + fit_pts <- split_ds(species_data) + eval_pts <- split_ds(species_data, "eval") + + treg <- obissdm::.get_time(treg, "Data loading") + + + + # PART 2: DATA PREPARING ---- + if (verbose) cli::cli_alert_info("Preparing data") + # Check which eco-regions are covered by the points + ecoreg_unique <- ecoregions$Realm[is.related(ecoregions, + vect(bind_rows(fit_pts, eval_pts), + geom = coord_names), + "intersects")] + + model_log$model_details$ecoregions <- ecoreg_unique + ecoreg_occ <- ecoregions[ecoregions$Realm %in% ecoreg_unique,] + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_occ_buff <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_occ), 0.2))) + ) + + adj_ecoreg <- ecoregions$Realm[is.related(ecoregions, ecoreg_occ_buff, + "intersects")] + + # Mask areas + ecoreg_sel <- ecoregions[ecoregions$Realm %in% unique( + c(ecoreg_unique, adj_ecoreg) + ),] + model_log$model_details$ecoregions_included <- unique(ecoreg_sel$Realm) + + # Apply a buffer to ensure that all areas are covered + sf::sf_use_s2(FALSE) + ecoreg_sel <- suppressMessages( + suppressWarnings(vect(sf::st_buffer(sf::st_as_sf(ecoreg_sel), 0.5))) + ) + + # Limit to max extension based on distance to the points + max_ext <- ext(vect(rbind(fit_pts, eval_pts), geom = coord_names)) + max_ext <- ext(as.vector(max_ext) + c(-20, 20, -20, 20)) + ecoreg_sel <- crop(ecoreg_sel, max_ext) + + # Limit by depth if TRUE + if (limit_by_depth) { + if (verbose) cli::cli_alert_info("Limiting by depth") + # To decide if this step will be kept: + # Load bathymetry layer + bath <- rast("data/env/terrain/bathymetry_mean.tif") + bath <- mask(crop(bath, ecoreg_sel), ecoreg_sel) + + bath_pts <- terra::extract(bath, bind_rows(fit_pts, eval_pts)) + + bath_range <- range(bath_pts[,2]) + bath_range[1] <- bath_range[1] - depth_buffer + bath_range[2] <- ifelse((bath_range[2] + depth_buffer) > 0, + 0, bath_range[2] + depth_buffer) + + bath[bath < bath_range[1] | bath > bath_range[2]] <- NA + + if ("coastal" %in% names(env$hypothesis)) { + bath <- crop(bath, europe_starea) + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + env$layers <- mask(env$layers, env$layers$wavefetch) + } else { + env$layers <- mask(crop(env$layers, ecoreg_sel), bath) + } + + model_log$model_details$limited_by_depth <- TRUE + model_log$model_details$depth_buffer <- depth_buffer + + } else { + if ("coastal" %in% names(env$hypothesis)) { + ecoreg_sel <- crop(ecoreg_sel, europe_starea) + env$layers <- mask(env$layers, ecoreg_sel) + } else { + env$layers <- mask(env$layers, ecoreg_sel) + } + } + + # Assess SAC + fit_pts_sac <- try(obissdm::outqc_sac(fit_pts, + env_layers = subset(env$layers, env$hypothesis[[1]]), + plot_result = FALSE, + verbose = verbose)) + + if (inherits(fit_pts_sac, "try-error")) { + fit_pts_sac <- fit_pts + } + + # Make data object + quad_n <- ifelse(nrow(as.data.frame(env$layers, xy = T, na.rm = T)) < 50000, + round(nrow(as.data.frame(env$layers, xy = T, na.rm = T))), 50000) + + sp_data <- mp_prepare_data(fit_pts_sac, eval_data = eval_pts, + species_id = species_name, + env_layers = env$layers, + quad_number = quad_n, + verbose = verbose) + + block_grid <- get_block_grid(sp_data, env$layers, + sel_vars = env$hypothesis$basevars, + verbose = verbose) + + sp_data <- mp_prepare_blocks(sp_data, + method = "manual", + block_types = "spatial_grid", + n_iterate = 300, + retry_if_zero = TRUE, + manual_shp = block_grid, + verbose = verbose) + + model_log$model_details$block_size <- unname(sp_data$blocks$grid_resolution) + model_log$model_fit_points <- sum(sp_data$training$presence) + model_log$model_eval_points <- sum(sp_data$eval_data$presence) + + treg <- obissdm::.get_time(treg, "Data preparing") + + # PART 3: ASSESS SPATIAL BIAS ---- + if (assess_bias) { + if (verbose) cli::cli_alert_info("Assessing spatial bias") + # Add some way of assessing spatial bias + model_log$model_details$control_bias <- correct_bias + + # Get spatstat statistics and point process for control + spat_im <- as.im(as.data.frame(aggregate(env$layers[[1]], 10, na.rm = T), xy = T)) + spat_window <- as.owin(spat_im) + # Define ppp object based on point locations + spat_ppp <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 1], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 1], + window = spat_window) + # calculate envelope around L-hat estimates. + spat_env_k <- envelope(spat_ppp, Kest, verbose = F) + spat_env_l <- envelope(spat_ppp, Lest, verbose = F) + + # Point process model (?) + spat_quad <- ppp(x = sp_data$coord_training$decimalLongitude[sp_data$training$presence == 0], + y = sp_data$coord_training$decimalLatitude[sp_data$training$presence == 0], + window = spat_window) + spat_quad_sc <- quadscheme(data = spat_ppp, dummy = spat_quad, method = "dirichlet") + spat_data_obj <- lapply(1:nlyr(env$layers), function(x){ + as.im(as.data.frame(aggregate(env$layers[[x]], 4, na.rm = T), xy = T)) + }) + names(spat_data_obj) <- names(env$layers) + spat_ppm <- ppm(spat_quad_sc, + trend = as.formula(paste("~", + paste(names(spat_data_obj)[ + names(spat_data_obj) %in% env$hypothesis[[1]] + ], collapse = "+"))), + covariates = spat_data_obj) + + spat_pred <- predict(spat_ppm, covariates = spat_data_obj, ngrid = c(spat_data_obj[[1]]$dim[1], spat_data_obj[[1]]$dim[2])) + + if (correct_bias) { + bias_layer <- density(spat_ppp, sigma = 1) + bias_layer <- rast(bias_layer) + bias_layer <- disagg(bias_layer, 10) + crs(bias_layer) <- crs("EPSG:4326") + bias_layer <- project(bias_layer, env$layers[[1]]) + bias_layer <- mask(bias_layer, env$layers[[1]]) + } + + treg <- obissdm::.get_time(treg, "Sample bias assessment") + } + + + # PART 4: MODEL SELECTION ---- + if (verbose) cli::cli_alert_info("Model selection") + multi_mod_max <- sdm_multhypo(sdm_data = sp_data, sdm_method = "maxent", + variables = env$hypothesis, + verbose = verbose) + + if (correct_bias) { + if (verbose) cli::cli_alert_info("Testing bias correction") + temp_data <- sp_data + temp_data$training <- temp_data$training[, c("presence", multi_mod_max$best_variables)] + temp_data$eval_data <- temp_data$eval_data[, c("presence", multi_mod_max$best_variables)] + + temp_data$training$biasgrid <- extract(bias_layer, temp_data$coord_training, ID = F)[,1] + temp_data$eval_data$biasgrid <- extract(bias_layer, temp_data$coord_eval, ID = F)[,1] + + not_na <- !is.na(temp_data$training$biasgrid) + not_na_eval <- !is.na(temp_data$eval_data$biasgrid) + + temp_data$training <- temp_data$training[not_na,] + temp_data$coord_training <- temp_data$coord_training[not_na,] + + temp_data$eval_data <- temp_data$eval_data[not_na_eval,] + temp_data$coord_eval <- temp_data$coord_eval[not_na_eval,] + + max_bias <- sdm_fit(temp_data) + + bias_metrics <- apply(max_bias$cv_metrics, 2, mean, na.rm = T) + model_metrics <- apply(multi_mod_max$model$cv_metrics, 2, mean, na.rm = T) + + if (bias_metrics[["cbi"]] > (model_metrics[["cbi"]] + 0.1)) { + # TODO: continue here + # TODO: bias layer needs to be other, not from the single species + # group or something - to see + } + } + + # Prepare data for the best model + sp_data$training <- sp_data$training[, c("presence", multi_mod_max$best_variables)] + sp_data$eval_data <- sp_data$eval_data[, c("presence", multi_mod_max$best_variables)] + + treg <- obissdm::.get_time(treg, "Model selection") + + + # PART 4: FIT MODELS ---- + if (verbose) cli::cli_alert_info("Starting model fitting") + model_fits <- list() + for (i in 1:length(algorithms)) { + if (verbose) cli::cli_alert_info("Fitting {algorithms[i]}") + model_fits[[i]] <- try(do.call(paste0("sdm_module_", algorithms[i]), list(sp_data, verbose = verbose))) + if (inherits(model_fits[[i]], "try-error")) { + model_log$model_result[[algorithms[i]]] <- "failed" + model_fits[[i]] <- NULL + } else { + model_log$model_result[[algorithms[i]]] <- "succeeded" + } + } + + treg <- obissdm::.get_time(treg, "Model fit") + + + if (any(unlist(model_log$model_result) == "succeeded")) { + if (verbose) cli::cli_alert_info("Model fitting concluded. Checking if models are good") + # PART 5: PREDICTION ---- + + # Define thresholds + tg_metrics <- "cbi" + tg_threshold <- 0.3 + + # Get what models are above threshold + good_models <- lapply(model_fits, function(model){ + if (!is.null(model)) { + cv_res <- model$cv_metrics + cv_res <- apply(cv_res, 2, mean, na.rm = T) + the_metric <- cv_res[[tg_metrics]] + if (the_metric >= tg_threshold) { + if (sum(is.na(model$cv_metrics[[tg_metrics]])) >= 4) { + return(FALSE) + } else { + return(TRUE) + } + } else { + return(FALSE) + } + } else { + return(FALSE) + } + }) + + good_models <- which(unlist(good_models)) + + if (length(good_models) > 0) { + if (verbose) cli::cli_alert_info("Good models available, predicting.") + # Predict models + model_predictions <- lapply(good_models, function(id) { + model_name <- model_fits[[id]]$name + + best_hyp <- multi_mod_max$best_model + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/") + + if (!dir.exists(outpath)) fs::dir_create(outpath) + + outmess <- paste0(outpath, "taxonid=", species, "_model=", outacro, "_what=mess.tif") + + if (!file.exists(gsub("mess", "mess_cog", outmess))) {do_mess <- TRUE} else {do_mess <- FALSE} + + scenarios <- data.frame( + scenario = c("current", rep(c("ssp116", "ssp245", "ssp370", "ssp460", "ssp585"), + each = 2)), + year = c(NA, rep(c("dec50", "dec100"), 5)) + ) + + for (k in 1:nrow(scenarios)) { + + if (is.na(scenarios$year[k])) { + period <- NULL + } else { + period <- scenarios$year[k] + } + + if (verbose) cli::cli_alert_info("Predicting scenario {k} of {nrow(scenarios)}.") + + env_to_pred <- obissdm::get_envofgroup(group, + depth = hab_depth, load_all = F, + scenario = scenarios$scenario[k], + period = period, + hypothesis = best_hyp, + env_folder = "data/env", + conf_file = "sdm_conf.yml", + verbose = verbose) + + pred <- predict(model_fits[[id]], env_to_pred) + + names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (do_mess) { + # Save MESS + if (!exists("to_mess")) { + maxpt <- nrow(as.data.frame(env_to_pred, xy = T, na.rm = T)) + to_mess <- sample(1:maxpt, + ifelse(maxpt > 10000, 10000, maxpt)) + } + mess_map <- ecospat::ecospat.mess( + na.omit(as.data.frame(env_to_pred, xy = T)[to_mess,]), + cbind(sp_data$coord_training, sp_data$training[,2:ncol(sp_data$training)])) + mess_map_t <- env_to_pred[[1]] + mess_map_t[] <- NA + mess_map_t[cellFromXY(mess_map_t, mess_map[,1:2])] <- mess_map[,5] + mess_map <- mask(mess_map_t, env_to_pred[[1]]) + + names(mess_map) <- names(pred) <- paste0(scenarios$scenario[k], ifelse(is.null(period), "", paste0("_", period))) + + if (k == 1) { + pred_mess <- mess_map + } else { + pred_mess <- c(pred_mess, mess_map) + } + } + + + if (k == 1) { + pred_f <- pred + } else { + pred_f <- c(pred_f, pred) + } + } + + writeRaster(pred_f, paste0(outpath, "taxonid=", species, "_model=", outacro, "_method=", model_name,".tif"), + overwrite = T) + + if (do_mess) { + writeRaster(pred_mess, outmess, overwrite = T) + cogeo_optim(outmess) + } + + return(pred_f[["current"]]) + + }) + + treg <- obissdm::.get_time(treg, "Model prediction") + + + + # PART X: GET RESPONSE CURVES ---- + if (verbose) cli::cli_alert_info("Getting response curves") + # Predict models + model_respcurves <- lapply(good_models, function(id) { + + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=respcurves.parquet") + + rcurves <- resp_curves(model_fits[[id]], + subset(env$layers, multi_mod_max$best_variables)) + + write_parquet(rcurves, outfile) + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/figures/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=responsecurves.png") + + p <- plot(rcurves) + ggtitle(paste("AphiaID:", species, "| Model:", outacro, "| Algo:", model_name)) + ggplot2::ggsave(filename = outfile, plot = p, + width = 10, height = 8) + + return(invisible(NULL)) + + }) + + treg <- obissdm::.get_time(treg, "Response curves") + + + + # PART X: ENSEMBLE ---- + if (length(good_models) > 1) { + if (verbose) cli::cli_alert_info("Enough number of models, ensembling") + + to_ensemble <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/"), + pattern = "\\.tif$", full.names = T) + to_ensemble <- to_ensemble[grepl(paste0(substr(algorithms, 1, 3)[good_models], collapse = "|"), to_ensemble)] + to_ensemble <- lapply(to_ensemble, rast) + lays <- nlyr(to_ensemble[[1]]) + + ensemble <- lapply(1:lays, function(l){ + pull_l <- lapply(to_ensemble, function(r){ + r[[l]] + }) + pull_l <- rast(pull_l) + ensemble_models("median", pull_l) + }) + + ensemble_cv <- rast(lapply(ensemble, function(x) x[[2]])) + ensemble <- rast(lapply(ensemble, function(x) x[[1]])) + + names(ensemble) <- names(ensemble_cv) <- names(to_ensemble[[1]]) + names(ensemble_cv) <- paste0(names(ensemble_cv), "_sd") + + outens <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/", + "taxonid=", species, "_model=", outacro, "_method=ensemble.tif") + + writeRaster(c(ensemble, ensemble_cv), outens, + #wopt=list(verbose=TRUE), + overwrite = T) + + model_predictions <- c(model_predictions, ensemble[[1]]) + + res_ens <- cogeo_optim(outens) + + # Evaluate ensemble + ensemble_curr <- ensemble[[1]] + + fit_ens <- extract(ensemble_curr, sp_data$coord_training, ID = F) + + fit_ens_eval <- obissdm::eval_metrics(sp_data$training$presence, fit_ens[,1]) + + if (!is.null(sp_data$eval_data)) { + eval_ens <- extract(ensemble_curr, sp_data$coord_eval, ID = F) + + eval_ens_eval <- obissdm::eval_metrics(sp_data$eval_data$presence, eval_ens[,1]) + + ensemble_metrics <- rbind(cbind(as.data.frame(t(fit_ens_eval)), origin = "fit_data"), + cbind(as.data.frame(t(eval_ens_eval)), origin = "eval_data")) + } else { + ensemble_metrics <- cbind(as.data.frame(t(fit_ens_eval)), origin = "fit_data") + } + + # Get average of models + avg_fit_metrics <- as.data.frame(t(apply(do.call("rbind", lapply(good_models, function(id){ + t(apply(model_fits[[id]]$cv_metrics, 2, mean, na.rm = T)) + })), 2, mean, na.rm = T))) + avg_fit_metrics$origin <- "avg_fit" + + if (!is.null(sp_data$eval_data)) { + avg_eval_metrics <- as.data.frame(t(apply(do.call("rbind", lapply(good_models, function(id){ + t(model_fits[[id]]$eval_metrics) + })), 2, mean, na.rm = T))) + avg_eval_metrics$origin <- "avg_eval" + avg_fit_metrics <- rbind(avg_fit_metrics, avg_eval_metrics) + } + + ensemble_metrics <- rbind(ensemble_metrics, avg_fit_metrics) + + # TODO: add SD? + + write_parquet(ensemble_metrics, + paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/", + "taxonid=", species, "_model=", outacro, + "_method=ensemble_what=metrics.parquet")) + } + + # Ensemble response curves + if (length(good_models) > 1) { + to_ensemble <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/"), + pattern = "what=resp", full.names = T) + to_ensemble <- to_ensemble[grepl(paste0(substr(algorithms, 1, 3)[good_models], collapse = "|"), to_ensemble)] + to_ensemble <- lapply(to_ensemble, read_parquet) + + init_data <- to_ensemble[[1]] + + for (k in 2:length(to_ensemble)) { + tdf <- data.frame(to_ensemble[[2]][,c("response")]) + colnames(tdf) <- paste0("response_", k) + init_data <- cbind(init_data, tdf) + } + + ens_resp <- apply(init_data[,startsWith(colnames(init_data), "resp")], 1, "median") + + ens_resp_curves <- cbind(init_data[,c("variable", "base")], response = ens_resp) + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=ensemble", + "_what=respcurves.parquet") + + write_parquet(ens_resp_curves, outfile) + + } + + + + # PART X: SAVE BINARIZATION INFO ---- + if (verbose) cli::cli_alert_info("Saving binarization info") + # Get thresholds + # P10 threshold + thresh_p10_mtp <- lapply(model_predictions, function(pred) { + predv <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F)[,1] + p10 <- ceiling(length(predv) * 0.9) + data.frame(p10 = rev(sort(predv))[p10], mtp = min(na.omit(predv))) + }) + + # Max Specificity + Sensitivity + thresh_maxss <- lapply(model_predictions, function(pred) { + pred_p <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F)[,1] + pred_a <- raster::extract(pred, sp_data$coord_training[sp_data$training$presence == 0,], ID = F)[,1] + + pred_eval <- predicts::pa_evaluate(p = pred_p, a = pred_a) + + pred_thresh <- predicts::threshold(pred_eval) + + pred_thresh + }) + + if (length(model_predictions) > length(good_models)) { + names(thresh_p10_mtp) <- names(thresh_maxss) <- c(algorithms[good_models], "ensemble") + } else { + names(thresh_p10_mtp) <- names(thresh_maxss) <- algorithms[good_models] + } + + thresh_p10_mtp <- bind_rows(thresh_p10_mtp, .id = "model") + thresh_maxss <- bind_rows(thresh_maxss, .id = "model") + + thresh <- left_join(thresh_p10_mtp, thresh_maxss, by = "model") + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + write_parquet(thresh, paste0(outpath, + "taxonid=", species, "_model=", outacro, + "_what=thresholds.parquet")) + + + + # PART X: OPTIMIZE GEOTIFF (COG FORMAT) ---- + if (verbose) cli::cli_alert_info("Optimizing rasters (COG)") + + to_optimize <- list.files(paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions"), full.names = T) + to_optimize <- to_optimize[!grepl("aux", to_optimize)] + to_optimize <- to_optimize[!grepl("cog", to_optimize)] + + optimize_result <- lapply(to_optimize, cogeo_optim) + optimize_result <- do.call("rbind", optimize_result) + + + + # PART X: CREATE MASKS AND SAVE ---- + if (verbose) cli::cli_alert_info("Saving masks") + + # Get base raster + base_layer <- rast("data/env/terrain/bathymetry_mean.tif") + base_layer[!is.na(base_layer)] <- 1 + + # Get mask for study area species occurrence + ecoreg_occ_mask <- mask(base_layer, ecoreg_occ) + + # Get mask for study area ecoregions + ecoreg_mask <- mask(base_layer, ecoreg_sel) + + # Get area used for fitting + fit_mask <- terra::extend(env$layers[[1]], model_predictions[[1]]) + fit_mask[!is.na(fit_mask)] <- 1 + + masks <- c(ecoreg_occ_mask, ecoreg_mask, fit_mask) + names(masks) <- c("native_ecoregions", "fit_ecoregions", "fit_region") + + outmask <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/predictions/", + "taxonid=", species, "_model=", outacro, "_mask.tif") + terra::writeRaster(masks, outmask, overwrite = T) + mask_opt <- cogeo_optim(outmask) + if (file.exists(paste0(outmask, ".aux.json"))) fs::file_delete(paste0(outmask, ".aux.json")) + + treg <- obissdm::.get_time(treg, "File optimization") + + + + # PART X: EVALUATE MODELS USING OTHER TECHNIQUES ---- + if (verbose) cli::cli_alert_info("Performing post-evaluation") + + # TEST 1: ENVIRONMENTAL SPACE + # Make PCA + # pca_env <- princomp(scale(subset(env$layers, multi_mod_max$best_variables)), + # maxcell = (ncell(env$layers[[1]]) * .5)) + # pca_env_pred <- predict(scale(subset(env$layers, multi_mod_max$best_variables)), pca_env, index = 1:2) + # + # # Extract at data points + # pca_fit <- terra::extract(pca_env_pred, sp_data$coord_training[sp_data$training$presence == 1,], ID = F) + # + # # Extract at SAMPLED data points + # pca_pred <- lapply(model_predictions, function(pred){ + # pred <- mask(pred, masks$fit_region) + # + # samp_pred <- spatSample(pred, size = sum(sp_data$training$presence), method = "weights", values = F, xy = T, na.rm = T) + # + # terra::extract(pca_env_pred, samp_pred, ID = F) + # }) + # + # plot(NULL, xlim = minmax(pca_env_pred)[,1], ylim = minmax(pca_env_pred)[,2]) + # + # points(pca_fit, pch = 20, cex = .5, col = "blue") + # points(pca_pred[[1]], pch = 20, cex = .5, col = "red") + # + # chul1 <- chull(pca_fit) + # chul1 <- c(chul1, chul1[1]) + # chul1 <- vect(as.matrix(pca_fit[chul1,]), type = "polygons") + # + # chul2 <- chull(pca_pred[[1]]) + # chul2 <- c(chul2, chul2[1]) + # chul2 <- vect(as.matrix(pca_pred[[1]][chul2,]), type = "polygons") + # + # lines(chul1, col = "blue") + # lines(chul2, col = "red") + + + # Add Hypervolume option, # TODO + + # TEST 2 - temperature kernel + # Load temperature information + temp_layer <- list.files("data/env/", recursive = T, + pattern = "thetao_baseline", + full.names = T) + temp_layer <- temp_layer[grepl(hab_depth, temp_layer)] + temp_layer <- temp_layer[grepl("mean\\.tif$", temp_layer)] + temp_layer <- rast(temp_layer) + + if (ext(temp_layer) != ext(model_predictions[[1]])) { + temp_layer <- crop(temp_layer, model_predictions[[1]]) + } + + data_fit <- extract(temp_layer, + dplyr::bind_rows(sp_data$coord_training, + sp_data$coord_eval), ID = F) + + kd <- ks::kde(data_fit[,1], h = 8) + percentiles <- ks::qkde(c(0.01, 0.99), kd) + + masked <- temp_layer + masked[temp_layer >= percentiles[1] & temp_layer <= percentiles[2]] <- 3 + masked[temp_layer < percentiles[1] | temp_layer > percentiles[2]] <- 0 + + binary_maps <- lapply(1:length(model_predictions), function(x){ + pred <- model_predictions[[x]] + pred[pred < thresh_p10_mtp[x,]$p10] <- 0 + pred[pred > 0] <- 1 + pred + }) + + plot(masked) + plot(binary_maps[[1]]) + + diff_maps <- lapply(binary_maps, function(x){ + x + masked + }) + + diff_perc <- lapply(diff_maps, function(x){ + x[x == 0] <- NA + x[x == 3] <- NA + result <- as.data.frame(x) + result <- as.data.frame(table(result[,1])) + colnames(result) <- c("status", "frequency") + result$status <- ifelse(result$status == 1, "outside_tenv", "inside_tenv") + result$percentage <- round((result$frequency * 100) / sum(result$frequency), 2) + result + }) + + trange_maps <- lapply(binary_maps, function(x){ + x[x != 1] <- NA + x_pts <- as.data.frame(x, xy = T)[,1:2] + quantile(extract(temp_layer, x_pts, ID = F)[,1], + c(0.01, 0.05, 0.5, 0.95, 0.99), na.rm = T) + }) + + for (gm in 1:length(good_models)) { + salg <- algorithms[good_models[gm]] + model_log$model_posteval[[salg]] <- list( + thermal_range = quantile(data_fit[,1], c(0.01, 0.05, 0.5, 0.95, 0.99)), + thermal_range_binary = trange_maps[[gm]], + thermal_envelope = diff_perc[[gm]] + ) + } + + # jacc <- function(raster1.bin, raster2.bin) { + # combination <- sum(raster1.bin, raster2.bin, na.rm= T) + # intersection <- combination == 2 + # + # # Union is all the area covered by the both rasters + # union <- combination >= 1 + # + # union <- as.data.frame(union) + # inter <- as.data.frame(intersection) + # + # length(inter[inter[,1],]) / length(union[union[,1],]) + # } + # + # overlap_metric <- jacc(masked, crop(binary_maps[[1]], masked)) + # overlap_metric_b <- dismo::nicheOverlap(raster::raster(masked), raster::raster(crop(binary_maps[[1]], masked))) + # + # + # Test 4: niche overlap ecospat + + + + # PART X: SAVE MODELS OBJECTS ---- + if (verbose) cli::cli_alert_info("Saving models objects") + + # Save metrics + save_metrics <- lapply(good_models, function(id){ + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/metrics/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name) + + write_parquet(model_fits[[id]]$cv_metrics, paste0(outfile, "_what=cvmetrics.parquet")) + + other_metrics <- list(model_fits[[id]]$full_metrics, + NULL)#model_fits[[id]]$eval_metrics) + names(other_metrics) <- c("full", "eval") + other_metrics <- dplyr::bind_rows(other_metrics, .id = "what") + + write_parquet(other_metrics, paste0(outfile, "_what=fullmetrics.parquet")) + }) + + # Update log with best parameters + for (al in 1:length(algorithms)) { + if (!is.null(model_fits[[al]])) { + model_log$model_bestparams[[al]] <- model_fits[[al]]$parameters + } + } + + # Save models + save_models <- lapply(good_models, function(id) { + + model_name <- model_fits[[id]]$name + + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/models/") + if (!dir.exists(outpath)) fs::dir_create(outpath) + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_method=", model_name, + "_what=model.rds") + + saveRDS(model_fits[[id]], file = outfile) + + return(invisible(NULL)) + + }) + + # Save fit points + write_parquet(sp_data$coord_training[sp_data$training$presence == 1,], + paste0(outfolder, "/taxonid=", species, "/model=", outacro, + "/taxonid=", species, "_model=", outacro, "_", + "what=fitocc.parquet")) + + + # Save log and return object + outpath <- paste0(outfolder, "/taxonid=", species, "/model=", outacro, "/") + outfile <- paste0(outpath, + "taxonid=", species, "_model=", outacro, "_what=log.json") + save_log(model_log, outfile) + + cli::cli_alert_success("Model concluded for species {species}.") + + st_status <- "succeeded" + } else { + if (verbose) cli::cli_alert_warning("No good model available") + st_status <- "no_good_model" + } + + } else { + if (verbose) cli::cli_alert_warning("All models failed") + st_status <- "all_failed" + } + + } else { + if (nrow(species_data) < 15) { + if (verbose) cli::cli_alert_warning("Low number of points, failed") + # If data is low, annotate storr + st_status <- "low_data" + } else { + if (verbose) cli::cli_alert_warning("No data available, failed") + # If data is not available, annotate storr + st_status <- "no_data" + } + } + + # Return nothing, results are saved + return(st_status) + +} + + + +# Auxiliary functions ---- +# Convert eco info to habitat depth +hab_to_depth <- function(hab, default = "depthsurf") { + + if (grepl("benthic|demersal|bottom", hab)) { + dstrata <- "depthmax" + } else if (grepl("pelagic_surface", hab)) { + dstrata <- "depthsurf" + } else if (grepl("NOT_FOUND", hab)) { + dstrata <- default + } else { + dstrata <- "depthmean" + } + + return(dstrata) +} + +# Split dataset into eval and fit +split_ds <- function(sp_occ, + what = "fit", + only_coords = TRUE) { + + if (grepl("fit", what)) { + to_get <- "fit_points" + } else if (grepl("eval", what)) { + to_get <- "eval_points" + } else { + stop("What should be one of `fit_points` or `eval_points`") + } + + pts <- sp_occ %>% + filter(data_type == to_get) %>% + as.data.frame() + + if (nrow(pts) == 0) { + pts <- NULL + } else if (only_coords) { + pts <- pts[,c("decimalLongitude", "decimalLatitude")] + } + + return(pts) +} \ No newline at end of file diff --git a/plots_experimental.R b/plots_experimental.R new file mode 100644 index 0000000..a5ec533 --- /dev/null +++ b/plots_experimental.R @@ -0,0 +1,169 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas Principe, Mike Burrows +# Contact: s.principe@unesco.org, michael.burrows@sams.ac.uk +# +########## Read and analyse eDNA data from UNESCO World Heritage Sites ######### + +# Load packages ---- +library(dplyr) +library(stringr) +library(purrr) +library(arrow) +library(data.table) +library(ggplot2) +fs::dir_create("figures") +set.seed(2023) + +# Load files and convert to data.table ---- +# Species summaries +speciestherm <- read_parquet("results/species_tsummaries.parquet") +speciesthermdt <- data.table(speciestherm) + +# Site summaries +speciesthermsite <- read_parquet("results/tsummaries_aggregated.parquet") +speciesthermsitedt <- data.table(speciesthermsite) + +# Species list +specieslist <- read_parquet("results/species_list.parquet") + + +# Select relevant depth/variant ---- +speciesthermsst <- speciesthermdt[depth == "depthsurf" & variant == "mean", , ] +speciesthermsstc <- dcast(speciesthermdt, + species ~ metric, + value.var = "value", + fun = mean) + +# Get community thermal index +ctihighergeog <- speciesthermsitedt[, list( + ctiavg = mean(q_0.5), + sdcti = sd(q_0.5), + str = mean(q_0.9 - q_0.1), + sstavg = mean(site_current), + nspp = .N +), by = c("higherGeography", "where")] + +for (i in 1:100) { + + resampled <- speciesthermsitedt[, list( + q_0.5 = sample(q_0.5, .N, replace = T), + q_0.9 = sample(q_0.9, .N, replace = T), + q_0.1 = sample(q_0.1, .N, replace = T), + site_current = sample(site_current, .N, replace = T) + #resample = 1:.N#sample(1:.N, .N, replace = T) + ), by = c("higherGeography", "where")] + + ctihighergeog_temp <- resampled[, list( + ctiavg = mean(q_0.5), + sdcti = sd(q_0.5), + str = mean(q_0.9 - q_0.1), + sstavg = mean(site_current), + nspp = .N + ), by = c("higherGeography", "where")] + + if (i == 1) { + ctihighergeog_boot <- ctihighergeog_temp + } else { + ctihighergeog_boot <- rbind(ctihighergeog_boot, ctihighergeog_temp) + } + +} + +# Make plots ---- +plot(data=ctihighergeog,ctiavg~sstavg,pch=16,col=unclass(as.factor(where)),cex=1.5, + xlab="Sea surface temperature (°C)", ylab="Community Temperature Index (°C)",main="Fish") + +ggplot(ctihighergeog) + + geom_point(aes(x = sstavg, y = ctiavg, color = where), shape = 16, size = 2) + + xlab("Sea surface temperature (°C)") + + ylab("Community Temperature Index (°C)") + + theme_light() + + theme(panel.grid = element_blank(), + legend.title = element_blank()) + + +### Bootstrap version +ctihighergeog$higherGeography <- substr(ctihighergeog$higherGeography, 1, 10) +ctihighergeog$higherGeography <- factor(ctihighergeog$higherGeography, + levels = unique(ctihighergeog$higherGeography[order(ctihighergeog$sstavg)])) + +ctihighergeog_boot$higherGeography <- substr(ctihighergeog_boot$higherGeography, 1, 10) +ctihighergeog_boot$higherGeography <- factor(ctihighergeog_boot$higherGeography, + levels = levels(ctihighergeog$higherGeography)) + +ctihighergeog_boot_sum <- ctihighergeog_boot[, list( + ctiavg_low = quantile(ctiavg, 0.25), + ctiavg_median = median(ctiavg), + ctiavg_high = quantile(ctiavg, 0.75), + sdcti_low = quantile(sdcti, 0.25), + sdcti_median = median(sdcti), + sdcti_high = quantile(sdcti, 0.75), + str_low = quantile(str, 0.25), + str_median = median(str), + str_high = quantile(str, 0.75) +), by = c("higherGeography", "where")] + +ctihighergeog_sst <- ctihighergeog_boot[,list( + sstavg = mean(sstavg) +), by = c("higherGeography", "where")] +ctihighergeog_sst$what = "Average site SST" + +ggplot() + + geom_point(data = ctihighergeog_sst, aes(x = higherGeography, y = sstavg, fill = what), + shape = "|", size = 6, alpha = .1) + + geom_jitter(data = ctihighergeog_boot, aes(x = higherGeography, y = ctiavg, color = where), + shape = 16, size = 1, alpha = .05, + position = position_jitterdodge(dodge.width = 0.5, + jitter.height = 0, jitter.width = 0.1)) + + geom_linerange(data = ctihighergeog_boot_sum, + aes(x = higherGeography, ymin = ctiavg_low, ymax = ctiavg_high, + color = where), linewidth = 1, + position = position_dodge(width = 0.5)) + + geom_point(data = ctihighergeog, aes(x = higherGeography, y = ctiavg, color = where), + shape = 16, size = 2.5, position = position_dodge(width = 0.5)) + + scale_color_manual(values = c("#00c3c9", "#5151db", "#f58000")) + + scale_fill_manual(values = c("grey70")) + + guides(fill = guide_legend(override.aes = list(alpha = 1))) + + ylab("Temperature (°C)") + + xlab(NULL) + + theme_light() + + coord_flip() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank()) + +ggsave("figures/test_newcti.png", width = 7, height = 10) + +ctihighergeog_sst_b <- ctihighergeog_sst[,list( + sstavg = mean(sstavg) +), by = c("higherGeography")] +ctihighergeog_sst_b$what <- ctihighergeog_sst$what[1] + +ggplot() + + geom_col(data = ctihighergeog_sst_b, aes(x = higherGeography, y = sstavg, fill = what), + alpha = .3, width = 0.8) + + geom_jitter(data = ctihighergeog_boot, aes(x = higherGeography, y = ctiavg, color = where), + shape = 16, size = 1, alpha = .05, + position = position_jitterdodge(dodge.width = 0.5, + jitter.height = 0, jitter.width = 0.1)) + + geom_linerange(data = ctihighergeog_boot_sum, + aes(x = higherGeography, ymin = ctiavg_low, ymax = ctiavg_high, + color = where), linewidth = 1, + position = position_dodge(width = 0.5)) + + geom_point(data = ctihighergeog, aes(x = higherGeography, y = ctiavg, color = where), + shape = 16, size = 2.5, position = position_dodge(width = 0.5)) + + scale_color_manual(values = c("#00c3c9", "#5151db", "#f58000")) + + scale_fill_manual(values = c("grey70")) + + guides(fill = guide_legend(override.aes = list(alpha = 1))) + + scale_y_continuous(expand = c(0,0,0,2)) + + ylab("Temperature (°C)") + + xlab(NULL) + + theme_light() + + coord_flip() + + theme(panel.grid = element_blank(), + panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3, linetype = 2), + legend.title = element_blank()) + +ggsave("figures/test_newcti_opt2.png", width = 7, height = 10) diff --git a/sdm_demonstration.pdf b/sdm_demonstration.pdf new file mode 100644 index 0000000..3bfdaf7 Binary files /dev/null and b/sdm_demonstration.pdf differ diff --git a/sdm_demonstration.qmd b/sdm_demonstration.qmd new file mode 100644 index 0000000..d82c8b1 --- /dev/null +++ b/sdm_demonstration.qmd @@ -0,0 +1,75 @@ +--- +title: "SDMs MHS" +format: pdf +execute: + echo: false +editor: visual +--- + +## Example of application + +Considering only two species to see if its worthwhile: + +1. `r worrms::wm_id2name(217352)` +2. `r worrms::wm_id2name(217426)` + +Load maps and points: + +```{r} +suppressPackageStartupMessages(library(terra)) +suppressPackageStartupMessages(library(arrow)) +terraOptions(progress=0) + +species_a <- rast("results/sdms/taxonid=217352/model=mhs/predictions/taxonid=217352_model=mhs_method=maxnet_cog.tif") +species_b <- rast("results/sdms/taxonid=217426/model=mhs/predictions/taxonid=217426_model=mhs_method=maxnet_cog.tif") + +sp_a_pts <- read_parquet("results/sdms/taxonid=217352/model=mhs/taxonid=217352_model=mhs_what=fitocc.parquet") +sp_b_pts <- read_parquet("results/sdms/taxonid=217426/model=mhs/taxonid=217426_model=mhs_what=fitocc.parquet") + +par(mfrow=c(1,2)) +plot(species_a[[1]], range = c(0,1));points(sp_a_pts, pch = 20, cex = .5) +plot(species_b[[1]], range = c(0,1));points(sp_b_pts, pch = 20, cex = .5) +``` + +Apply masks to isolate to "native" areas: + +```{r} +sp_a_mask <- rast("results/sdms/taxonid=217352/model=mhs/predictions/taxonid=217352_model=mhs_mask_cog.tif") +sp_b_mask <- rast("results/sdms/taxonid=217426/model=mhs/predictions/taxonid=217426_model=mhs_mask_cog.tif") + +species_a <- mask(species_a, sp_a_mask$native_ecoregions) +species_b <- mask(species_b, sp_b_mask$native_ecoregions) + +par(mfrow=c(1,2)) +plot(species_a[[1]], range = c(0,1));points(sp_a_pts, pch = 20, cex = .5) +plot(species_b[[1]], range = c(0,1));points(sp_b_pts, pch = 20, cex = .5) +``` + +Convert to binary format (maybe we can do without it also - in fact, it may be a better choice, but interpretation is a little bit more tricky): + +```{r} +sp_a_th <- read_parquet("results/sdms/taxonid=217352/model=mhs/metrics/taxonid=217352_model=mhs_what=thresholds.parquet") +sp_b_th <- read_parquet("results/sdms/taxonid=217426/model=mhs/metrics/taxonid=217426_model=mhs_what=thresholds.parquet") + +species_a[species_a < mean(sp_a_th$p10[sp_a_th$model == "maxent"])] <- 0 +species_b[species_b < mean(sp_b_th$p10[sp_b_th$model == "maxent"])] <- 0 + +species_a[species_a > 0] <- 1 +species_b[species_b > 0] <- 1 + +par(mfrow=c(1,2)) +plot(species_a[[1]], range = c(0,1));points(sp_a_pts, pch = 20, cex = .5) +plot(species_b[[1]], range = c(0,1));points(sp_b_pts, pch = 20, cex = .5) +``` + +Sum results to get a "richness" indicator: + +```{r} +multi_sp <- sum(species_a, species_b, na.rm = T) + +par(mfrow = c(1,2)) +plot(multi_sp[[1]], main = "Current period") +plot(multi_sp$ssp245_dec50, main = "SSP2 (2050)") +``` + +Once we aggregate multiple species, we can assess how much richness will change in the WHSs.