First results
silasprincipe committed Jan 24, 2024
1 parent 5abb363 commit 6562244
Showing 8 changed files with 600 additions and 130 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -52,9 +52,13 @@ rsconnect/
codes/drafts/

data/
results/

functions/drafts/

*.zip

temp/
temp/

*_storr
*.parquet
8 changes: 7 additions & 1 deletion README.md
@@ -33,15 +33,21 @@ __Temperature by species__
- `get_edna_marinespecies.R`: aggregate the species lists from the eDNA project, downloaded from the repository https://github.com/iobis/edna-species-lists
- `temperature_database.R`: aggregate temperature/environmental layers into the OBIS/GBIF database (see details of the database at https://github.com/iobis/protectedseas-statistics)
- `get_sst_marinespecies.R`: extract temperature information for all species listed for the sites
- `plot_sst_marinespecies.R`: plot temperature information for a selected species
- `get_sst_sites.R`: extract temperature information for each site (collection site, not _higherGeography_)

There is also a function in the `functions` folder, `query_sst_marinespecies.R`, that can be used to query the full data or temperature summaries for a species (using the database).
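
For reference, a hypothetical call could look like the sketch below. The function name comes from this repository, but the argument names, the example species, and the returned structure are assumptions, not the documented interface:

```r
# Hypothetical usage sketch -- arguments and outputs are assumptions
source("functions/query_sst_marinespecies.R")

# Summarised temperature quantiles for one (illustrative) species
summary_tbl <- query_sst_marinespecies("Acropora cervicornis", summary = TRUE)

# Full temperature records instead of the summary
full_tbl <- query_sst_marinespecies("Acropora cervicornis", summary = FALSE)
```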

__Others__

- `get_depthprofiles.R`: check which depth was actually used when obtaining the data from Copernicus
- `generate_quarto_sites.R`: generate Quarto pages by site
- `plot_sst_marinespecies.R`: plot temperature information for a selected species

## Data analysis

After retrieving temperature information for each species occurrence from OBIS/GBIF (aggregated by H3 cells at resolution 7), we generate a table with temperature quantiles for each species. This is also done in the `get_sst_marinespecies.R` script. The same data is obtained for each site.
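
Conceptually, the quantile step is equivalent to the sketch below. The input table name and its columns (`species`, `temperature`) are assumptions for illustration; the actual script may organise the data differently:

```r
library(dplyr)

# Minimal sketch, assuming a long table `occurrence_temps` with one row per
# occurrence/H3 cell and columns `species` and `temperature`
quantile_table <- occurrence_temps %>%
  group_by(species) %>%
  summarise(q_0.01 = quantile(temperature, 0.01, na.rm = TRUE),
            q_0.25 = quantile(temperature, 0.25, na.rm = TRUE),
            q_0.5  = quantile(temperature, 0.5,  na.rm = TRUE),
            q_0.75 = quantile(temperature, 0.75, na.rm = TRUE),
            q_0.99 = quantile(temperature, 0.99, na.rm = TRUE))
```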

We join both tables to analyse the thermal limits of each species and to assess how species at each site might be at risk from climate change. These steps are done in the `analysis_sst_species.R` script.

----

140 changes: 140 additions & 0 deletions codes/analysis_sst_species.R
@@ -0,0 +1,140 @@
#################### eDNA expeditions - scientific analysis ####################
########################## Environmental data analysis #########################
# January of 2024
# Author: Silas C. Principe
# Contact: [email protected]
#
############################# Species thermal limits ###########################

# Load packages ----
library(arrow)
library(dplyr)
library(tidyr)
library(ggplot2)
library(patchwork)


# Load files ----
# Temperature per species dataset
temp_sp <- open_dataset("results/species_tsummaries.parquet")

temp_sp_filt <- temp_sp %>%
filter(depth == "depthsurf") %>%
filter(variant == "mean") %>%
collect()

# Temperature on sites
temp_sites <- read_parquet("results/sites_tsummaries.parquet")

# Get sites list
sites_f <- list.files("data/species_list_full", full.names = T)

sites_list <- lapply(sites_f, read.csv)
names(sites_list) <- gsub("_", " ", gsub("\\.csv", "", basename(sites_f)))

sites_list <- bind_rows(
sites_list, .id = "higherGeography"
)

sites_list_distinct <- sites_list %>%
select(species, AphiaID, source_obis, source_gbif, source_dna, higherGeography) %>%
group_by(higherGeography) %>%
distinct(species, .keep_all = T)

# Get number of records
species_rec <- read_parquet("results/records_by_sp.parquet")


# Bind information ----
# Bind species temperature
temp_sp_wide <- temp_sp_filt %>%
pivot_wider(names_from = metric, values_from = value) %>%
select(-depth, -variant)

sites_list_binded <- sites_list_distinct %>%
left_join(temp_sp_wide) %>%
filter(!is.na(q_0.25))

# Bind temperature on sites
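# Site names in the temperature table carry capitalisation and punctuation;
# lowercase them and clean punctuation so they match the higherGeography keys
# built from the species list file names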
temp_sites_filt <- temp_sites %>%
filter(depth == "depthsurf") %>%
filter(variant == "mean") %>%
group_by(higherGeography, period) %>%
summarise(temperature = mean(temperature)) %>%
pivot_wider(names_from = period, values_from = temperature) %>%
mutate(higherGeography = tolower(higherGeography)) %>%
mutate(higherGeography = gsub("[']", " ", higherGeography)) %>%
mutate(higherGeography = gsub("[:,]", "", higherGeography))

colnames(temp_sites_filt)[2:length(temp_sites_filt)] <- paste0("site_", colnames(temp_sites_filt)[2:length(temp_sites_filt)])

sites_list_binded <- sites_list_binded %>%
left_join(temp_sites_filt) %>%
filter(!is.na(site_current))

# Bind number of records
sites_list_binded <- sites_list_binded %>%
left_join(species_rec)

# Create a tag indicating the data source of each record
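# Each source gets a power-of-two flag (OBIS = 1, GBIF = 2, eDNA = 4), so the
# summed value uniquely identifies the combination: 4 = eDNA only,
# 1-3 = OBIS/GBIF only, 5-7 = present in both eDNA and OBIS/GBIF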
sites_list_binded <- sites_list_binded %>%
mutate(source_obis = ifelse(as.numeric(source_obis) == 1, 1, 0)) %>%
mutate(source_gbif = ifelse(as.numeric(source_gbif) == 1, 2, 0)) %>%
mutate(source_dna = ifelse(as.numeric(source_dna) == 1, 4, 0)) %>%
mutate(where = source_obis + source_gbif + source_dna) %>%
mutate(where = case_when(
where == 4 ~ "eDNA",
where %in% c(3, 2, 1) ~ "OBIS/GBIF",
where %in% c(6, 5, 7) ~ "Both"
))

# Save a copy
write_parquet(sites_list_binded, "results/tsummaries_aggregated.parquet")


# Get thermal ranges ----
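# Keep species with at least 10 records and compute loc_range: the position of
# the site's current temperature within the species' thermal envelope
# (0 = cold limit q_0.01, 1 = warm limit q_0.99; values above 1 mean the site
# is already warmer than the species' upper quantile)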
species_thermal_range <- sites_list_binded %>%
filter(records >= 10) %>%
mutate(loc_range = (site_current - q_0.01)/(q_0.99 - q_0.01))

# Plot to verify
ggplot(species_thermal_range) +
geom_boxplot(aes(x = loc_range))


# Create a function to plot for just one site
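# `site` can be a higherGeography name or a numeric index into the unique site
# names; the output is two stacked jitter panels (all species, and split by
# data source), with a dashed line at the site's current surface temperature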
plot_site <- function(site) {

if (is.numeric(site)) {
site <- unique(sites_list_binded$higherGeography)[site]
}

sel_site <- sites_list_binded %>%
filter(higherGeography == site) %>%
# Recreate the source tag consistently with the earlier case_when
mutate(where = ifelse((source_obis | source_gbif) & !source_dna, "OBIS/GBIF",
ifelse(source_dna & !source_obis & !source_gbif, "eDNA", "Both")))

plot_a <- sel_site %>%
mutate(status = ifelse(q_0.5 > min(sel_site$site_current), "Above", "Below")) %>%
ggplot() +
geom_jitter(aes(y = higherGeography, x = q_0.5, color = status), alpha = .2, height = 0.2) +
geom_vline(xintercept = min(sel_site$site_current), linetype = 2) +
theme_minimal() +
ylab(NULL) + xlab(NULL) + ggtitle("All species", site) +
theme(legend.title = element_blank(),
axis.text.y = element_blank())

plot_b <- sel_site %>%
mutate(status = ifelse(q_0.5 > min(sel_site$site_current), "Above", "Below")) %>%
ggplot() +
geom_jitter(aes(y = where, x = q_0.5, color = status), alpha = .2, height = 0.2) +
geom_vline(xintercept = min(sel_site$site_current), linetype = 2) +
theme_minimal() +
ylab(NULL) + ggtitle("By type") +
theme(legend.title = element_blank())

plot_a / plot_b + plot_layout(guides = "collect", heights = c(1,2)) & theme(legend.position = "bottom")

}

plot_site(3)
98 changes: 68 additions & 30 deletions codes/download_temperature_toolbox.R
@@ -9,6 +9,10 @@
# Load packages ----
library(terra)
library(reticulate)
library(arrow)
library(parallel)
library(dplyr)
library(tidyr)
cm <- import("copernicus_marine_client")


@@ -113,36 +113,70 @@ for (site_idx in mhs$code) {
#
# # Print loaded dataset information
# print(sst_l3s)


# Convert to parquet
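# Each NetCDF file is reshaped to a long table (x, y, time, value), tagged with
# its variable name, and written as a gzip-compressed parquet file; job::job()
# runs the conversion in a background session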
proc <- job::job({
lapply(list.files(outfolder, full.names = TRUE, pattern = "\\.nc"),
function(fname, redo = TRUE) {

outfile <- gsub("\\.nc", ".parquet", fname)

if (file.exists(outfile) & !redo) {
cat("File already processed, skipping.\n")
} else {
r <- rast(fname)
r_dat <- as.data.frame(r, xy = TRUE, time = TRUE)
r_dat <- r_dat %>%
pivot_longer(3:length(.), names_to = "time", values_to = "values") %>%
mutate(layer = gsub("\\..*", "", names(r))[1])
arrow::write_parquet(r_dat, outfile, compression = "gzip")
}

return(invisible(NULL))
})

lf <- list.files(outfolder, full.names = TRUE, pattern = "\\.parquet")
base <- list.files(outfolder, full.names = TRUE, pattern = "\\.nc")[1]

zip("temperature_dayly.zip", c(lf, base))

job::export("none")
})
#
#
# # Convert to parquet
# proc <- job::job({
# lapply(list.files(outfolder, full.name = T, pattern = "\\.nc"),
# function(fname, redo = F) {
#
# outfile <- gsub("\\.nc", ".parquet", fname)
#
# if (file.exists(outfile) & !redo) {
# cat("File already processed, skipping.\n")
# } else {
# cat("Processing", outfile, "\n")
# r <- rast(fname)
#
# if (file.size(fname)/1000/1000 > 1000) {
#
# cat("Splitting in pieces (large file) \n")
#
# fs::dir_create("temp")
#
# grid <- sf::st_make_grid(sf::st_bbox(r), cellsize = 1)
#
# mclapply(1:length(grid), function(x){
# r_b <- r
# r_b <- crop(r, grid[x])
# r_b <- as.data.frame(r_b, xy = TRUE, time = TRUE)
# r_b <- r_b %>%
# pivot_longer(3:length(.), names_to = "time", values_to = "values") %>%
# mutate(layer = gsub("\\..*", "", names(r)[1]))
# arrow::write_parquet(r_b, sprintf("temp/temp_%06d.parquet", x),
# compression = "gzip")
# }, mc.cores = 3)
#
# gen_files <- open_dataset("temp")
#
# gen_files %>%
# write_parquet(., outfile, compression = "gzip")
#
# fs::dir_delete("temp")
# } else {
#
# r_dat <- as.data.frame(r, xy = TRUE, time = TRUE)
#
# r_dat <- r_dat %>%
# pivot_longer(3:length(.), names_to = "time", values_to = "values") %>%
# mutate(layer = gsub("\\..*", "", names(r)[1]))
#
# arrow::write_parquet(r_dat, outfile, compression = "gzip")
# }
#
#
# }
#
# return(invisible(NULL))
# })
#
# lf <- list.files(outfolder, full.name = T, pattern = "\\.parquet")
# base <- list.files(outfolder, full.name = T, pattern = "\\.nc")[1]
#
# zip("temperature_dayly.zip", c(lf, base))
#
# job::export("none")
# })



