diff --git a/.gitignore b/.gitignore
index 19d46bc..cd990df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,7 @@
 po/*~
 rsconnect/
 codes/drafts/
+
+data/
+
+functions/drafts/
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..7874566
--- /dev/null
+++ b/README.html
@@ -0,0 +1,422 @@
+[422 lines of knitted HTML omitted: the rendered "eDNA expeditions / Environmental information on Marine Heritage Sites" page, duplicating the README.md content below]
diff --git a/README.md b/README.md
index f49c73c..dda9080 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,31 @@
-# Understanding how temperature is changing on the Marine Heritage Sites
+# eDNA expeditions
+## Environmental information on Marine Heritage Sites
-Work in progess. Experimental.
+This repository documents the process to obtain and analyse environmental information (sea temperature and oxygen) on Marine Heritage Sites. Environmental information is downloaded from Copernicus, from the following data sources:
-See the page at https://iobis.github.io/marineheritage_sst
+- SST: [Global Ocean Physics Reanalysis](https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_PHY_001_030/description)
+- Oxygen: [Global Ocean Biogeochemistry Hindcast](https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_029/description)
-To add a section with 'climate vulnerability' for the most abundant/present species (?)
+You can see the live page at https://iobis.github.io/marineheritage_sst
-![image](https://github.com/iobis/marineheritage_sst/assets/53846571/db63c648-b78a-4154-9559-c042efd3a607)
+## Codes
+
+### Data download
+
+There are several ways of obtaining Copernicus satellite data. We provide code for three possible pathways:
+
+1. From WEkEO (`download_temperature_wekeo.ipynb`) - WEkEO is a service from Copernicus that provides a virtual environment (JupyterHub) for satellite data processing. Because all Copernicus data is accessible directly from the virtual environment, this is the fastest way of obtaining the data. WEkEO is free and an account can be created here. Once you have set up your virtual environment, open the Jupyter notebook provided here to download the data.
+2. Using OpenDAP (`download_temperature_opendap.R`) - OpenDAP is a data access protocol that is widely used to access satellite data. OpenDAP is also a very quick way to access Copernicus data, and here we adapt code provided by Jorge Assis to subset and download the information we need. Note, however, that OpenDAP support for Copernicus is being deprecated in mid-2024 in favor of the new Python API (see below).
+3. Using the new Copernicus API (`download_*_toolbox.R`) - Copernicus introduced major changes in its Marine Data Store in 2023, including a new [toolbox](https://help.marine.copernicus.eu/en/articles/7949409-copernicus-marine-toolbox-introduction) for data access. Unfortunately, the toolbox is Python-only and there is no R equivalent. Using it from R relies either on a system interface to the CLI or on a bridge through the `reticulate` package (the approach used here; see the minimal sketch below, after the data processing section). The code `get_depthprofiles.R` is used to obtain the nearest available depth to each of the chosen depths.
+
+To stay in line with the upcoming changes in the Copernicus services, we adopted pathway 3 to obtain the data. It requires **Python** and the toolbox to be installed ([instructions here](https://help.marine.copernicus.eu/en/articles/7970514-copernicus-marine-toolbox-installation); we recommend installing with `pip`).
+
+Oxygen data download proceeds in the same way as temperature (use the code with the word "oxygen" in its name).
+
+### Data processing
+
+Data analysis for Marine Heat Waves and Cold Spells was done using the package [`heatwaveR`](https://robwschlegel.github.io/heatwaveR/). More details to be added after the workshop.
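For the download step (pathway 3 above), the `download_*_toolbox.R` scripts in this PR bridge to the Python toolbox through `reticulate`. A minimal sketch of that bridge follows; it assumes the toolbox is installed in the Python that `reticulate` picks up (the `pip` package name, the credentials, and the output paths are placeholders, the `cm$subset()` arguments mirror those used in `codes/download_temperature_toolbox.R`):

```r
# Minimal sketch of the reticulate bridge used by the download_*_toolbox.R scripts.
# Assumes the Copernicus Marine Toolbox was installed beforehand, e.g.:
#   pip install copernicus-marine-client   # assumed package name
library(reticulate)

cm <- import("copernicus_marine_client")

# Example subset call; credentials, bounding box, dates and output are placeholders
cm$subset(
  dataset_id        = "cmems_mod_glo_phy_my_0.083deg_P1D-m",
  variables         = list("thetao"),
  username          = "my_user",
  password          = "my_password",
  minimum_longitude = -10, maximum_longitude = 10,
  minimum_latitude  = -10, maximum_latitude  = 10,
  start_datetime    = "2020-01-01T00:00:00",
  end_datetime      = "2020-12-31T23:59:59",
  minimum_depth     = 0, maximum_depth = 0.5,
  output_filename   = "thetao_example.nc",
  output_directory  = "data/temperature",
  force_download    = TRUE
)
```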
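For the processing step, which is only outlined above, a minimal `heatwaveR` sketch of how the event detection typically looks is given below. It uses the package's bundled `sst_WA` demo series (a data frame with a date column `t` and a temperature column `temp`); the assumption is that the per-site daily series extracted from the parquet files would be reshaped into the same two-column format:

```r
library(heatwaveR)

# Daily climatology and 90th-percentile threshold over a 30-year baseline
clim <- ts2clm(sst_WA, climatologyPeriod = c("1982-01-01", "2011-12-31"))

# Marine Heat Waves: events above the 90th-percentile threshold
mhw <- detect_event(clim)
head(mhw$event)

# Marine Cold Spells: 10th-percentile threshold and coldSpells = TRUE
clim_cold <- ts2clm(sst_WA, climatologyPeriod = c("1982-01-01", "2011-12-31"), pctile = 10)
mcs <- detect_event(clim_cold, coldSpells = TRUE)
head(mcs$event)
```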
+ +---- + + diff --git a/codes/download_oxygen_toolbox.R b/codes/download_oxygen_toolbox.R new file mode 100644 index 0000000..541ec98 --- /dev/null +++ b/codes/download_oxygen_toolbox.R @@ -0,0 +1,136 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas C. Principe +# Contact: s.principe@unesco.org +# +##################### Oxygen concentration from Copernicus ##################### + +# Load packages ---- +library(terra) +library(reticulate) +cm <- import("copernicus_marine_client") + + +# Settings ---- +.user <- rstudioapi::askForPassword("Enter your user") +.pwd <- rstudioapi::askForPassword("Enter your password") + +outfolder <- "data/oxygen" +fs::dir_create(outfolder) +filename <- "var=o2" + + +# Load areas ---- +mhs <- vect("data/shapefiles/marine_world_heritage.gpkg") +# Add an index - we put from 0 to keep consistency with the JupytherHub approaches +mhs$code <- 0:(length(mhs)-1) + + +# Define target dataset, time and depths +dataset <- "cmems_mod_glo_bgc_my_0.25_P1D-m" +product <- "globgch" +years <- 1992:2021 +depths <- c(0, 25, 50, 75, 100, 150, 200, 250, 500, 1000, 2000) +variables <- list("o2") +lon_lat_buffer <- 0.5 # in degrees +failed <- c() # To see if any failed + +for (site_idx in mhs$code) { + + sel_site <- mhs[mhs$code == site_idx, ] + + long_range <- ext(sel_site)[1:2] + c(-lon_lat_buffer, lon_lat_buffer) + lat_range <- ext(sel_site)[3:4] + c(-lon_lat_buffer, lon_lat_buffer) + + for (depth in depths) { + + outfile <- paste0(filename, "_site=", site_idx, "_depth=", depth, "_product=", product, ".nc") + + if (!file.exists(paste0(outfolder, "/", outfile))) { + success <- try(cm$subset( + dataset_id = dataset, + variables = variables, + username = .user, + password = .pwd, + minimum_longitude = long_range[1], + maximum_longitude = long_range[2], + minimum_latitude = lat_range[1], + maximum_latitude = lat_range[2], + start_datetime = paste0(min(years), "-01-01T00:00:00"), + end_datetime = paste0(max(years), "-12-31T23:59:59"),#"2022-01-31T23:59:59", + minimum_depth = depth, + maximum_depth = depth+0.5, + output_filename = outfile, + output_directory = outfolder, + force_download = TRUE + ), silent = T) + + # It will rarely fail, but can happen due to server connection problems. + # In those cases, sleep and retry + if (inherits(success, "try-error")) { + cat("Retrying... \n") + Sys.sleep(1) + success <- try(cm$subset( + dataset_id = dataset, + variables = variables, + username = .user, + password = .pwd, + minimum_longitude = long_range[1], + maximum_longitude = long_range[2], + minimum_latitude = lat_range[1], + maximum_latitude = lat_range[2], + start_datetime = paste0(min(years), "-01-01T00:00:00"), + end_datetime = paste0(max(years), "-12-31T23:59:59"),#"2022-01-31T23:59:59", + minimum_depth = depth, + maximum_depth = depth+0.5, + output_filename = outfile, + output_directory = outfolder, + force_download = TRUE + ), silent = T) + if (inherits(success, "try-error")) {failed <- c(failed, outfile)} + } + } else { + cat(glue::glue("File for site {site_idx} depth {depth} already exists. 
Skipping.\n")) + } + + } + +} + +# Use the code below to explore a dataset +# +# sst_l3s = cm$open_dataset( +# dataset_id = dataset, +# #variables = variables, +# username = .user, +# password = .pwd, +# minimum_longitude = -10, +# maximum_longitude = 10, +# minimum_latitude = -10, +# maximum_latitude = 10 +# ) +# +# # Print loaded dataset information +# print(sst_l3s) + + +# Convert to parquet +proc <- job::job({ + lapply(list.files(outfolder, full.name = T, pattern = "\\.nc"), + function(fname, redo = T) { + + outfile <- gsub("\\.nc", ".parquet", fname) + + if (file.exists(outfile) & !redo) { + cat("File already processed, skipping.\n") + } else { + r <- rast(fname) + r_dat <- as.data.frame(r, xy = TRUE, time = TRUE, wide = FALSE) + r_dat <- subset(r_dat, select = -layer) + arrow::write_parquet(r_dat, outfile, compression = "gzip") + } + + return(invisible(NULL)) + }) +}) diff --git a/codes/download_sst_current.R b/codes/download_sst_current.R deleted file mode 100644 index 92b306d..0000000 --- a/codes/download_sst_current.R +++ /dev/null @@ -1,214 +0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -################ Retrieve SST satellite data for current period ################ - -# Load packages and define settings ---- -library(terra) -library(sf) -library(dplyr) -library(ggplot2) -library(arrow) -# Path to save files -save_path <- "data/sst/current/" - - -# Load Python functions ---- -reticulate::source_python("functions/inspect_cop.py") -# reticulate::py_help(inspect_cop) # To see help run this line -reticulate::source_python("functions/retrieve_sst_fun.py") -# reticulate::py_help(retrieve_cop) # To see help run this line - - -# Get credentials for Copernicus Marine service ---- -cop_user <- rstudioapi::askForPassword("Copernicus username") -cop_pass <- rstudioapi::askForPassword("Copernicus password") - -# Load marine heritage sites shapefile ---- -mhs_path <- "data/shapefiles/WorldMarineHeritageSites_v2.shp" -mhs <- st_read(mhs_path) - -# Limit to those that are confirmed on the website -# (ask for a csv list) -mhs <- mhs[grepl(paste0( - c( - "wadden", "shark", "noronha", "rocas", "french austral", "lord howe", - "sundarbans", "coiba", "revillagigedo", "caledonia", "calanche", - "sanganeb", "everglades", "aldabra", "belize", "tubbataha", - "simangaliso", "banc", "ningaloo", "socotra", "Península Valdés" - ), collapse = "|" -), mhs$FULL_NAME, ignore.case = T),] - -mhs_sites <- unique(mhs$MRGID) - -# Oceans (optional) # -oceans <- mregions::mr_shp(key = "MarineRegions:goas", read = TRUE, maxFeatures = 1000) -oceans <- st_as_sf(oceans) - -inter <- st_intersects(mhs, st_make_valid(oceans), sparse = F) - -mhs$OCEAN <- oceans$name[apply(inter, 1, which.max)] - - -# Open dataset ---- -# Dataset name -dataset <- "METOFFICE-GLO-SST-L4-REP-OBS-SST" -# Variable of interest -variable <- "analysed_sst" -# Time range -time_window <- 1992:2021 - -# Inspect and open dataset -dataset_info <- inspect_cop(dataset, cop_user, cop_pass, variable = variable, plot = T) - - -# Get mean, maximum, minimum and sd for each site and save ---- -# Because all will have the same configurations, we first create a function -get_metric <- function(site, metric, time_window) { - retrieve_cop(c(dataset, dataset_info), # Supplying dataset name and dataset object - variable, # Variable to open - 
cop_user, cop_pass, # Copernicus access information - shape = mhs_path, # Shapefile path - shape_var = "MRGID", # Shapefile variable to filter - shape_filter = site, # Filter value - time_range = time_window, # Time range - ret_type = "df", # Returning type - metric = metric, # Metric of summary - group_by_lonlat = TRUE, # Group by grouper and summarise by whole raster - res_by_month = TRUE, # Aggregate by month - grouper = "time", # Get for each time step - mask_by_shape = FALSE, # Mask by the shapefile - k_to_celsius = TRUE, # Convert from degree to celsius - plot = F) # Plot during execution -} - -# We run in loop, that way if there is any problem we can try again -sites_metrics <- list() - -for (z in 16:length(mhs_sites)) { - # We load the dataset again at each start to avoid the server disconnecting - dataset_info <- inspect_cop(dataset, cop_user, cop_pass, variable = variable, plot = F) - for (k in 1:length(time_window)) { - tr <- paste0(time_window[k], c("-01-01", "-12-31")) - temp_data <- get_metric(mhs_sites[z], - metric = c("mean", "std", "max", "min", "median"), - time_window = tr) - if (k == 1) { - sites_metrics[[z]] <- temp_data - } else { - sites_metrics[[z]] <- rbind(sites_metrics[[z]], - temp_data) - } - } -} - -# -# sites_metrics <- lapply(mhs_sites, get_metric, -# metric = c("mean", "std", "max", "min", "median")) - -# Convert to a single data frame containing sites info -sites_metrics <- lapply(sites_metrics, function(x){x$time <- rownames(x);x}) -names(sites_metrics) <- mhs_sites - -sites_metrics <- bind_rows(sites_metrics, .id = "MRGID") - -sites_info <- st_drop_geometry(mhs[,c("FULL_NAME", "COUNTRY", "MRGID")]) -sites_info <- distinct(sites_info, MRGID, .keep_all = T) - -sites_metrics <- left_join(sites_metrics, sites_info, by = "MRGID") - - -# Save results ---- -write_parquet(sites_metrics, paste0(save_path, "mhs_sst_current.parquet")) - - -# Plot results ---- -sites_metrics$time <- lubridate::as_date(sites_metrics$time) - -sites_metrics <- sites_metrics %>% - group_by(MRGID) %>% - mutate(general_mean = mean(mean)) %>% - mutate(detrend = mean - general_mean) %>% - select(-general_mean) - -ggplot(sites_metrics, aes(x = time, y = mean))+ - geom_line() + - geom_ribbon(aes(ymin = mean-sd, ymax = mean+sd), alpha = .3)+ - theme_light() + - facet_wrap(~ MRGID, scales = "free_y") - -sites_metrics$state <- ifelse(sites_metrics$detrend > 0, "Higher", "Lower") - -ggplot(sites_metrics, aes(x = time, y = detrend))+ - geom_hline(yintercept = 0) + - #geom_line(aes(color = state)) + - geom_area(aes(x=time, y=ifelse(detrend<0, detrend, 0)), fill="#1093C8") + - geom_area(aes(x=time, y=ifelse(detrend>0, detrend, 0)), fill="#C72B10") + - theme_bw() + - facet_wrap(~ MRGID, scales = "free_y") - - - - - - - -#### -# Animated grap -library(dygraphs) -library(xts) - -wide_metrics <- sites_metrics %>% - select(time, MRGID, mean, sd) %>% - filter(MRGID %in% c(26836, 64215)) %>% - mutate(upr = mean+sd, lwr = mean-sd) %>% - select(-sd) %>% - tidyr::pivot_wider(names_from = MRGID, values_from = c(mean, upr, lwr)) - -smetric <- xts(x = wide_metrics, order.by = wide_metrics$time) - -# Finally the plot -(p <- dygraph(smetric) %>% - dySeries(c("lwr_26836", "mean_26836", "upr_26836"), label = "26836") %>% - dySeries(c("lwr_64215", "mean_64215", "upr_64215"), label = "64215") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction 
= "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - #dyRoller(rollPeriod = 1) -) - - -smetric_a <- xts(x = wide_metrics[,c("time", "mean_26836", "lwr_26836", "upr_26836")], order.by = wide_metrics$time) -smetric_b <- xts(x = wide_metrics[,c("time", "mean_64215", "lwr_64215", "upr_64215")], order.by = wide_metrics$time) - - -(p1 <- dygraph(smetric_a, group = "sst") %>% - dySeries(c("lwr_26836", "mean_26836", "upr_26836"), label = "26836") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction = "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - #dyRoller(rollPeriod = 1) -) - -(p2 <- dygraph(smetric_b, group = "sst") %>% - dySeries(c("lwr_64215", "mean_64215", "upr_64215"), label = "64215") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction = "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - #dyRoller(rollPeriod = 1) -) \ No newline at end of file diff --git a/codes/download_sst_current_plotbymonth.R b/codes/download_sst_current_plotbymonth.R deleted file mode 100644 index 3ee2f90..0000000 --- a/codes/download_sst_current_plotbymonth.R +++ /dev/null @@ -1,220 +0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. 
Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -################ Retrieve SST satellite data for current period ################ - -# Load packages and define settings ---- -library(terra) -library(sf) -library(dplyr) -library(ggplot2) -library(arrow) -# Path to save files -save_path <- "data/sst/current/" - - -# Load Python functions ---- -reticulate::source_python("functions/inspect_cop.py") -# reticulate::py_help(inspect_cop) # To see help run this line -reticulate::source_python("functions/retrieve_sst_fun.py") -# reticulate::py_help(retrieve_cop) # To see help run this line - - -# Get credentials for Copernicus Marine service ---- -cop_user <- rstudioapi::askForPassword("Copernicus username") -cop_pass <- rstudioapi::askForPassword("Copernicus password") - -# Load marine heritage sites shapefile ---- -mhs_path <- "data/shapefiles/WorldMarineHeritageSites_v2.shp" -mhs <- st_read(mhs_path) - -# Limit to those that are confirmed on the website -# (ask for a csv list) -mhs <- mhs[grepl(paste0( - c( - "wadden", "shark", "noronha", "rocas", "french austral", "lord howe", - "sundarbans", "coiba", "revillagigedo", "caledonia", "calanche", - "sanganeb", "everglades", "aldabra", "belize", "tubbataha", - "simangaliso", "banc", "ningaloo", "socotra", "Península Valdés" - ), collapse = "|" -), mhs$FULL_NAME, ignore.case = T),] - -mhs_sites <- unique(mhs$MRGID) - -# Oceans (optional) # -oceans <- mregions::mr_shp(key = "MarineRegions:goas", read = TRUE, maxFeatures = 1000) -oceans <- st_as_sf(oceans) - -inter <- st_intersects(mhs, st_make_valid(oceans), sparse = F) - -mhs$OCEAN <- oceans$name[apply(inter, 1, which.max)] - - -# Open dataset ---- -# Dataset name -dataset <- "METOFFICE-GLO-SST-L4-REP-OBS-SST" -# Variable of interest -variable <- "analysed_sst" -# Time range -time_window <- 1992:2021 - -# Inspect and open dataset -dataset_info <- inspect_cop(dataset, cop_user, cop_pass, variable = variable, plot = T) - - -# Get mean, maximum, minimum and sd for each site and save ---- -# Because all will have the same configurations, we first create a function -get_metric <- function(site, metric, time_window) { - retrieve_cop(c(dataset, dataset_info), # Supplying dataset name and dataset object - variable, # Variable to open - cop_user, cop_pass, # Copernicus access information - shape = mhs_path, # Shapefile path - shape_var = "MRGID", # Shapefile variable to filter - shape_filter = site, # Filter value - time_range = time_window, # Time range - ret_type = "df", # Returning type - metric = metric, # Metric of summary - group_by_lonlat = TRUE, # Group by grouper and summarise by whole raster - res_by_month = TRUE, # Aggregate by month - grouper = "time", # Get for each time step - mask_by_shape = FALSE, # Mask by the shapefile - k_to_celsius = TRUE, # Convert from degree to celsius - plot = F) # Plot during execution -} - -# We run in loop, that way if there is any problem we can try again -sites_metrics <- list() - -for (z in 16:length(mhs_sites)) { - # We load the dataset again at each start to avoid the server disconnecting - dataset_info <- inspect_cop(dataset, cop_user, cop_pass, variable = variable, plot = F) - for (k in 1:length(time_window)) { - tr <- paste0(time_window[k], c("-01-01", "-12-31")) - temp_data <- get_metric(mhs_sites[z], - metric = c("mean", "std", "max", "min", "median"), - time_window = tr) - if (k == 1) { - sites_metrics[[z]] <- temp_data - } else { - sites_metrics[[z]] <- rbind(sites_metrics[[z]], - temp_data) - } - } -} - -# -# sites_metrics <- 
lapply(mhs_sites, get_metric, -# metric = c("mean", "std", "max", "min", "median")) - -# Convert to a single data frame containing sites info -sites_metrics <- lapply(sites_metrics, function(x){x$time <- rownames(x);x}) -names(sites_metrics) <- mhs_sites - -sites_metrics <- bind_rows(sites_metrics, .id = "MRGID") - -sites_info <- st_drop_geometry(mhs[,c("FULL_NAME", "COUNTRY", "MRGID")]) -sites_info <- distinct(sites_info, MRGID, .keep_all = T) - -sites_metrics <- left_join(sites_metrics, sites_info, by = "MRGID") - - -# Save results ---- -write_parquet(sites_metrics, paste0(save_path, "mhs_sst_current.parquet")) - - -# Plot results ---- -sites_metrics$time <- lubridate::as_date(sites_metrics$time) - -sites_metrics <- sites_metrics %>% - group_by(MRGID, month = month(time), year = year(time)) %>% - summarise(mean = mean(mean), sd = mean(sd), max = mean(max), min = mean(min), median = mean(median)) %>% - ungroup() %>% - group_by(MRGID) %>% - mutate(general_mean = mean(mean)) %>% - mutate(detrend = mean - general_mean) %>% - mutate(day = 1) %>% - unite(date, c("day", "month", "year"), sep = "-") %>% - mutate(date = as_date(date, format = "%d-%m-%Y")) %>% - select(-general_mean) - -ggplot(sites_metrics, aes(x = date, y = mean))+ - geom_line() + - geom_ribbon(aes(ymin = mean-sd, ymax = mean+sd), alpha = .3)+ - theme_light() + - facet_wrap(~ MRGID, scales = "free_y") - -sites_metrics$state <- ifelse(sites_metrics$detrend > 0, "Higher", "Lower") - -ggplot(sites_metrics, aes(x = date, y = detrend))+ - geom_hline(yintercept = 0) + - #geom_line(aes(color = state)) + - geom_area(aes(y=ifelse(detrend<0, detrend, 0)), fill="#1093C8") + - geom_area(aes(y=ifelse(detrend>0, detrend, 0)), fill="#C72B10") + - theme_bw() + - facet_wrap(~ MRGID, scales = "free_y") - - - - - - - -#### -# Animated grap -library(dygraphs) -library(xts) - -wide_metrics <- sites_metrics %>% - select(time, MRGID, mean, sd) %>% - filter(MRGID %in% c(26836, 64215)) %>% - mutate(upr = mean+sd, lwr = mean-sd) %>% - select(-sd) %>% - tidyr::pivot_wider(names_from = MRGID, values_from = c(mean, upr, lwr)) - -smetric <- xts(x = wide_metrics, order.by = wide_metrics$time) - -# Finally the plot -(p <- dygraph(smetric) %>% - dySeries(c("lwr_26836", "mean_26836", "upr_26836"), label = "26836") %>% - dySeries(c("lwr_64215", "mean_64215", "upr_64215"), label = "64215") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction = "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - #dyRoller(rollPeriod = 1) -) - - -smetric_a <- xts(x = wide_metrics[,c("time", "mean_26836", "lwr_26836", "upr_26836")], order.by = wide_metrics$time) -smetric_b <- xts(x = wide_metrics[,c("time", "mean_64215", "lwr_64215", "upr_64215")], order.by = wide_metrics$time) - - -(p1 <- dygraph(smetric_a, group = "sst") %>% - dySeries(c("lwr_26836", "mean_26836", "upr_26836"), label = "26836") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction = "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - 
#dyRoller(rollPeriod = 1) -) - -(p2 <- dygraph(smetric_b, group = "sst") %>% - dySeries(c("lwr_64215", "mean_64215", "upr_64215"), label = "64215") %>% - dyOptions(labelsUTC = TRUE, fillGraph=FALSE, fillAlpha=0.1, drawGrid = FALSE, - colors = RColorBrewer::brewer.pal(3, "Set2")) %>% - dyAxis("y", label = "Temperature (°C)") %>% - dyRangeSelector() %>% - dyCrosshair(direction = "vertical") %>% - dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE, - highlightSeriesOpts = list(strokeWidth = 2)) #%>% - #dyRoller(rollPeriod = 1) -) \ No newline at end of file diff --git a/codes/download_temperature_toolbox.R b/codes/download_temperature_toolbox.R new file mode 100644 index 0000000..169e9d9 --- /dev/null +++ b/codes/download_temperature_toolbox.R @@ -0,0 +1,136 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas C. Principe +# Contact: s.principe@unesco.org +# +######################## Sea temperature from Copernicus ####################### + +# Load packages ---- +library(terra) +library(reticulate) +cm <- import("copernicus_marine_client") + + +# Settings ---- +.user <- rstudioapi::askForPassword("Enter your user") +.pwd <- rstudioapi::askForPassword("Enter your password") + +outfolder <- "data/temperature" +fs::dir_create(outfolder) +filename <- "var=thetao" + + +# Load areas ---- +mhs <- vect("data/shapefiles/marine_world_heritage.gpkg") +# Add an index - we put from 0 to keep consistency with the JupytherHub approaches +mhs$code <- 0:(length(mhs)-1) + + +# Define target dataset, time and depths +dataset <- "cmems_mod_glo_phy_my_0.083deg_P1D-m" +product <- "glorys" +years <- 1992:2021 +depths <- c(0, 25, 50, 75, 100, 150, 200, 250, 500, 1000, 2000) +variables <- list("thetao") +lon_lat_buffer <- 0.5 # in degrees +failed <- c() # To see if any failed + +for (site_idx in mhs$code) { + + sel_site <- mhs[mhs$code == site_idx, ] + + long_range <- ext(sel_site)[1:2] + c(-lon_lat_buffer, lon_lat_buffer) + lat_range <- ext(sel_site)[3:4] + c(-lon_lat_buffer, lon_lat_buffer) + + for (depth in depths) { + + outfile <- paste0(filename, "_site=", site_idx, "_depth=", depth, "_product=", product, ".nc") + + if (!file.exists(paste0(outfolder, "/", outfile))) { + success <- try(cm$subset( + dataset_id = dataset, + variables = variables, + username = .user, + password = .pwd, + minimum_longitude = long_range[1], + maximum_longitude = long_range[2], + minimum_latitude = lat_range[1], + maximum_latitude = lat_range[2], + start_datetime = paste0(min(years), "-01-01T00:00:00"), + end_datetime = paste0(max(years), "-12-31T23:59:59"),#"2022-01-31T23:59:59", + minimum_depth = depth, + maximum_depth = depth+0.5, + output_filename = outfile, + output_directory = outfolder, + force_download = TRUE + ), silent = T) + + # It will rarely fail, but can happen due to server connection problems. + # In those cases, sleep and retry + if (inherits(success, "try-error")) { + cat("Retrying... 
\n") + Sys.sleep(1) + success <- try(cm$subset( + dataset_id = dataset, + variables = variables, + username = .user, + password = .pwd, + minimum_longitude = long_range[1], + maximum_longitude = long_range[2], + minimum_latitude = lat_range[1], + maximum_latitude = lat_range[2], + start_datetime = paste0(min(years), "-01-01T00:00:00"), + end_datetime = paste0(max(years), "-12-31T23:59:59"),#"2022-01-31T23:59:59", + minimum_depth = depth, + maximum_depth = depth+0.5, + output_filename = outfile, + output_directory = outfolder, + force_download = TRUE + ), silent = T) + if (inherits(success, "try-error")) {failed <- c(failed, outfile)} + } + } else { + cat(glue::glue("File for site {site_idx} depth {depth} already exists. Skipping.\n")) + } + + } + +} + +# Use the code below to explore a dataset +# +# sst_l3s = cm$open_dataset( +# dataset_id = dataset, +# #variables = variables, +# username = .user, +# password = .pwd, +# minimum_longitude = -10, +# maximum_longitude = 10, +# minimum_latitude = -10, +# maximum_latitude = 10 +# ) +# +# # Print loaded dataset information +# print(sst_l3s) + + +# Convert to parquet +proc <- job::job({ + lapply(list.files(outfolder, full.name = T, pattern = "\\.nc"), + function(fname, redo = T) { + + outfile <- gsub("\\.nc", ".parquet", fname) + + if (file.exists(outfile) & !redo) { + cat("File already processed, skipping.\n") + } else { + r <- rast(fname) + r_dat <- as.data.frame(r, xy = TRUE, time = TRUE, wide = FALSE) + r_dat <- subset(r_dat, select = -layer) + arrow::write_parquet(r_dat, outfile, compression = "gzip") + } + + return(invisible(NULL)) + }) +}) diff --git a/codes/get_depthprofiles.R b/codes/get_depthprofiles.R new file mode 100644 index 0000000..1bc785a --- /dev/null +++ b/codes/get_depthprofiles.R @@ -0,0 +1,96 @@ +#################### eDNA expeditions - scientific analysis #################### +########################## Environmental data download ######################### +# January of 2024 +# Author: Silas C. 
Principe +# Contact: s.principe@unesco.org +# +######################## Define depth profiles for sites ####################### + +# Load packages ---- +library(terra) +library(biooracler) +library(reticulate) +cm <- import("copernicus_marine_client") + + + +# Download bathymetry ---- +fs::dir_create("data/terrain") +bath <- download_layers("terrain_characteristics", "bathymetry_mean", + list(latitude = c(-89.975, 89.975), + longitude = c(-179.975, 179.975)), fmt = "raster", + directory = "data/terrain", + verbose = TRUE) + +# Load sites shapefiles ---- +mhs <- vect("data/shapefiles/marine_world_heritage.gpkg") + +# Get depths ---- +max_depth <- rep(NA, length(mhs)) +max_depth_within <- rep(NA, length(mhs)) +lon_lat_buffer <- 0.5 # in degrees + +for (site_idx in 1:length(mhs)) { + + sel_site <- mhs[site_idx, ] + + site_ext <- ext(sel_site) + + site_ext[1:2] <- site_ext[1:2] + c(-lon_lat_buffer, lon_lat_buffer) + site_ext[3:4] <- site_ext[3:4] + c(-lon_lat_buffer, lon_lat_buffer) + + sel_depth <- crop(bath, site_ext) + + max_depth[site_idx] <- as.numeric(global(sel_depth, min, na.rm = T)) + + max_depth_within[site_idx] <- as.numeric(extract(sel_depth, sel_site, fun = "min", na.rm = T)[,2]) +} + +min(max_depth) +min(max_depth_within, na.rm = T) + +max_depth <- data.frame(max_depth = max_depth, site_id = 1:length(mhs)) +max_depth_within <- data.frame(max_depth_within = max_depth_within, site_id = 1:length(mhs)) + +# Set the depths to be used ---- +depths <- c(0, 25, 50, 75, 100, 150, 200, 250, 500, 1000, 2000) + +# See which depths are being retrieved for each product ---- +temp_depths <- sapply(depths, function(x){ + temp_dataset <- cm$open_dataset( + dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m", + username = .user, + password = .pwd, + minimum_longitude = -10, + maximum_longitude = 10, + minimum_latitude = -10, + maximum_latitude = 10, + minimum_depth = x, + maximum_depth = x+0.5 + ) + temp_dataset$depth$to_dataframe()[,1] +}) + +o2_depths <- sapply(depths, function(x){ + temp_dataset <- cm$open_dataset( + dataset_id = "cmems_mod_glo_bgc_my_0.25_P1D-m", + username = .user, + password = .pwd, + minimum_longitude = -10, + maximum_longitude = 10, + minimum_latitude = -10, + maximum_latitude = 10, + minimum_depth = x, + maximum_depth = x+0.5 + ) + temp_dataset$depth$to_dataframe()[,1] +}) + +# Save object +saveRDS(list( + max_depth_sites = max_depth_within, + max_depth_sites_buff = max_depth, + depths_used = depths, + temperature_depths = temp_depths, + o2_depths = o2_depths +), file = "data/depth_profile.rds") diff --git a/codes/temperature_database.R b/codes/temperature_database.R new file mode 100644 index 0000000..d8900a4 --- /dev/null +++ b/codes/temperature_database.R @@ -0,0 +1,166 @@ +# Load packages ---- +library(dplyr) +library(future) +library(h3jsr) +library(storr) +library(terra) +library(furrr) + +# Define settings ---- +h3_res <- 7 + +# Get H3 indexes for environmental raster ---- +# Get a base raster +f <- list.files("~/Research/mpa_europe/mpaeu_sdm/data/env/current/", + full.names = T) +f <- f[!grepl("json", f)] + +f_st <- f[grepl("thetao_baseline_depthsurf_mean", f)] + +env <- rast(f_st) + +# Convert to data frame (cells and xy) +env_df <- as.data.frame(env, xy = T, cells = T) +env_df <- env_df[,1:3] + +# Open a storr to hold results +st <- storr_rds("data/envh3_storr") + +# Divide in batches to run +env_batches <- split(env_df, as.integer((seq_along(1:nrow(env_df)) - 1) / 300)) +ids <- 1:length(env_batches) + +rm(env_df) + +# Create a function to assign h3 index +to_h3 
<- function(obj, idx) { + h3 <- point_to_cell(obj[,c("x", "y")], res = h3_res) + h3_df <- cbind(obj, h3 = h3) + st$set(idx, h3_df) + return(1) +} + +# Map in parallel +plan(multisession, workers = 7) +h3_mapping <- future_map2(env_batches, ids, to_h3, .progress = TRUE) + +# Verify it worked +sum(unlist(h3_mapping)) + +# Remove unnescessary objects +rm(env_batches, h3_mapping) + +# Combine +env_h3 <- st$mget(as.character(1:length(st$list()))) + +env_h3 <- bind_rows(env_h3) + + +# Aggregate environmental information ---- + +# Create a storr to hold results +env_st <- storr_rds("data/envagg_storr/") + +# Divide in batches to run +to_split <- unique(env_h3$h3) +to_split <- split(to_split, + as.integer((seq_along(to_split) - 1) / 200)) +to_split <- lapply(to_split, function(x) data.frame(h3 = x)) +names(to_split) <- 1:length(to_split) +to_split <- bind_rows(to_split, .id = "id") + +env_h3 <- left_join(env_h3, to_split) + +env_batches <- split(env_h3, env_h3$id) +ids <- 1:length(env_batches) + +rm(to_split, env_h3) +gc() + +#all_rast <- rast(f) + +# Create a function to assign h3 index +agg_h3 <- function(obj, idx) { + + all_rast <- rast(f) + + r_vals <- all_rast[obj$cell] + + cols <- gsub("\\.tif", "", basename(f)) + + names(r_vals) <- cols + + agg_vals <- r_vals %>% + mutate(h3 = obj$h3) %>% + group_by(h3) %>% + summarise(across(everything(), ~ mean(.x, na.rm = TRUE))) + + env_st$set(idx, agg_vals) + return(1) +} + +# Run +plan(multisession, workers = 7) +h3_agg <- future_map2(env_batches, ids, agg_h3, .progress = TRUE) + +# Verify it worked +sum(unlist(h3_agg)) + + + +# Add to database ---- +sqlite_file <- "~/Downloads/protectedseas/database.sqlite" +con <- dbConnect(RSQLite::SQLite(), sqlite_file) + +purrr::map(1:length(ids), function(x) { + env_st$get(as.character(x)) %>% + dbWriteTable(con, "env_current", ., append = TRUE) +}, .progress = TRUE) + +dbDisconnect(con) + + +# Test query ---- +con <- dbConnect(RSQLite::SQLite(), sqlite_file) + +site_cells_table <- "site_cells" +gbif_occurrence_table <- "gbif_occurrence" +obis_occurrence_table <- "obis_occurrence" +env_table <- "env_current" +env_vars <- c("thetao_baseline_depthmax_min", + "thetao_baseline_depthmax_mean", + "thetao_baseline_depthmax_max") + +sel_species <- "Platichthys stellatus" + +obis_query <- glue::glue(" + with filtered_data as ( + select species, h3 + from {obis_occurrence_table} + where {obis_occurrence_table}.species = '{sel_species}' + ) + select species, filtered_data.h3, {paste(env_vars, collapse = ', ')} + from filtered_data + inner join env_current on filtered_data.h3 = env_current.h3; + ") + +obis_res <- dbSendQuery(con, obis_query) +obis_species <- dbFetch(obis_res) + +dbDisconnect(con) + +head(obis_species) + +species_points <- sf::st_as_sf(h3jsr::cell_to_point(obis_species$h3)) +species_points <- bind_cols(species_points, obis_species) + +base <- rnaturalearth::ne_countries(returnclass = "sf") + +library(ggplot2) + +ggplot() + + geom_sf(data = base, fill = "grey90") + + geom_sf(data = species_points, aes(color = thetao_baseline_depthmax_mean)) + + scale_color_distiller(name = "thetao mean") + + coord_sf() + + theme_minimal() diff --git a/functions/inspect_cop.py b/functions/inspect_cop.py deleted file mode 100644 index 734fda2..0000000 --- a/functions/inspect_cop.py +++ /dev/null @@ -1,93 +0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. 
Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -############### Inspect satellite dataset from Marine Copernicus ############### -def inspect_cop(dataset, username, password, plot = False, variable = None): - - """Inspect dataset from Copernicus Marine data products - - This function enables to inspect Copernicus satellite data prior to retrieving - the full dataset or part of that. - - Parameters - ---------- - dataset : str - Name of the dataset (e.g. METOFFICE-GLO-SST-L4-REP-OBS-SST) - username : str - Copernicus username - password : str - Copernicus password - plot : bool - Should the function plot the maps during the process? Default is False. - variable : str, optional - Name of the variable (e.g. analyzed_sst). Should be supplied if plot = True. - - Returns - ------- - xarray.Dataset - This object can be supplied to the function retrieve_cop - - Typical usage example: - ------- - user = 'myuser' - pass = 'mypassword' - shp_path = 'path/to/my_shape.shp' - out_path = 'data/saved/' - inspect_cop('METOFFICE-GLO-SST-L4-REP-OBS-SST', 'analyzed_sst', - user, pass) - - Depends on the following packages: - ------- - matplotlib, xarray - You may need to install lxml - """ - - import matplotlib.pyplot as plt - import xarray as xr - - if plot and variable == None: - raise ValueError('When plot=True, variable should be supplied. Check arguments.') - - #! /usr/bin/env python3 - # -*- coding: utf-8 -*- - __author__ = "Copernicus Marine User Support Team" - __copyright__ = "(C) 2022 E.U. Copernicus Marine Service Information" - __credits__ = ["E.U. Copernicus Marine Service Information"] - __license__ = "MIT License - You must cite this source" - __version__ = "202104" - __maintainer__ = "D. Bazin, E. DiMedio, C. Giordan" - __email__ = "servicedesk dot cmems at mercator hyphen ocean dot eu" - - def copernicusmarine_datastore(ds, user, pwd): - from pydap.client import open_url - from pydap.cas.get_cookies import setup_session - cas_url = 'https://cmems-cas.cls.fr/cas/login' - session = setup_session(cas_url, user, pwd) - session.cookies.set("CASTGC", session.cookies.get_dict()['CASTGC']) - database = ['my', 'nrt'] - url = f'https://{database[0]}.cmems-du.eu/thredds/dodsC/{ds}' - try: - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - except: - url = f'https://{database[1]}.cmems-du.eu/thredds/dodsC/{dataset}' - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - return data_store - - print('Loading dataset...', flush = True) - data_store = copernicusmarine_datastore(dataset, username, password) - DS = xr.open_dataset(data_store) - - print(DS, flush = True) - - if plot: - sliced = DS[variable].isel(time=1) - if 'depth' in DS.dims: - sliced = sliced.isel(depth=1) - plt.figure() - sliced.plot() - plt.show() - - return DS diff --git a/functions/inspect_ds.py b/functions/inspect_ds.py deleted file mode 100644 index 71d707b..0000000 --- a/functions/inspect_ds.py +++ /dev/null @@ -1,68 +0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. 
Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -################# Inspect satellite dataset from other sources ################# -def inspect_ds(dataset, username, password, plot = False, variable = None): - - """Inspect datasets stored in Zarr/Xarray format - - This function enables to inspect satellite data prior to retrieving - the full dataset or part of that. For Copernicus datasets, use inspect_cop - instead. - - Parameters - ---------- - dataset : str - Link to the dataset (e.g. https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1) - plot : bool - Should the function plot the maps during the process? Default is False. - plot_bbox : str, optional - If ploting, a bounding box to the plot. This is specially relevant for - datasets that are very large (i.e. high-resolution). Should be list on the - format [xmin, xmax, ymin, ymax]. - variable : str, optional - Name of the variable (e.g. analyzed_sst). Should be supplied if plot = True. - - Returns - ------- - xarray.Dataset - This object can be supplied to the function retrieve_cop - - Typical usage example: - ------- - inspect_ds(dataset = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1', - plot = True, plot_bbox = [-90, -30, -42.5, 42.5], - variable = 'analysed_sst') - - Depends on the following packages: - ------- - matplotlib, xarray, zarr - You may need to install lxml, aiohttp, requests - """ - - import matplotlib.pyplot as plt - import xarray as xr - - if plot and variable == None: - raise ValueError('When plot=True, variable should be supplied. Check arguments.') - - print('Loading dataset...', flush = True) - DS = xr.open_dataset(dataset) - - print(DS, flush = True) - - if plot: - sliced = DS[variable].isel(time=1) - sliced = sliced.sel( - lon=slice(plot_bbox[0], plot_bbox[1]), - lat=slice(plot_bbox[2], plot_bbox[3])) - if 'depth' in DS.dims: - sliced = sliced.isel(depth=1) - plt.figure() - sliced.plot() - plt.show() - - return DS diff --git a/functions/retrieve_sst_fun.py b/functions/retrieve_sst_fun.py deleted file mode 100644 index 672b156..0000000 --- a/functions/retrieve_sst_fun.py +++ /dev/null @@ -1,320 +0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -################ Retrieve satellite data from Marine Copernicus ################ -def retrieve_cop(dataset, variable, username, password, shape, shape_var, shape_filter, time_range, - ret_type = 'raster', metric = None, group_by_lonlat = False, res_by_month = False, - grouper = 'time', save_path = None, mask_by_shape = True, k_to_celsius = False, plot = False): - - """Retrieve information from Copernicus Marine data products - - Retrieve, select and optionally summarise Copernicus satellite data. This function - also enable to retrieve data from other datasets opened with the function 'inspect_ds'. - In that case, you should supply the result of the function as the dataset argument, - following the instructions on the parameters section. If your dataset have 3 dimensions - (multiple depths), then you should use the retrieve_cop_3d function. - - Parameters - ---------- - dataset : str or list - Name of the dataset (e.g. METOFFICE-GLO-SST-L4-REP-OBS-SST) or a list containing - the name of the dataset and the Xarray dataset object. The second approach is - relevant when you will loop over several areas or make distinct metric operations. 
- For getting the Xarray dataset, use first the function inspect_cop (or alternatively - inspect_ds for non-Copernicus datasets). NOTE: the list SHOULD be supplied - in that order (i.e. name, dataset). - variable : str - Name of the variable (e.g. analyzed_sst) - username : str - Copernicus username - password : str - Copernicus password - shape : str - Path to shapefile containing regions to mask - shape_var : str - Attribute (column) of the shapefile used to filter (e.g. MRGID) - shape_filter : str or numeric - Code defining the filter on the selected attribute - time_range : list, str - A list with two values defining the time range. Should be on the format - YYYY-MM-DD. E.g. ['2010-01-15', '2010-01-17'] - ret_type : str - Which value to return. There are 2 types available: - 'raster' - a netcdf raster is saved in save_path containing the whole - sliced (and optionally masked) dataset. - 'df' - a data frame like object is returned. Note that depending on the - size of your dataset this may be problematic! Use with care. - metric : str or list, optional - Metric to summarise the results. The metric is usually applied over the layers, - but can also produce a single value by changing the group_by_lonlat argument. Note - that if you set group_by_lonlat = True, then you should also define ret_type to - 'df', otherwise the function will stop. - Metric can be mean, min, max, median, sum, and std. There is one special available - which calculate both the mean and standard deviation: 'meanstd' If you supply - a list of metrics, then all will be calculated and merged in a single file or - data frame. The saved file (if one is saved) will be named "{name}_summarised.nc" - instead of "{name}_{metric}.nc". - group_by_lonlat : bool - If set to True and a metric is chosen, then the summary is applied over - the lon-lat grouped by time (i.e. a single value per time step). The time - step may be grouped differently by changing the 'grouper' argument. - res_by_month : bool - If set to True, daily information is resampled to monthly values (mean of - each month). This option is only relevant if you have daily data. Note that - after resampling, metrics are calculated with the resampled version. - grouper : str - The grouper before calculating the metrics. It is usually 'time', but can - also be 'time.month' to aggregate by month or other valid Xarray value. - If group is done by lon-lat (group_by_lonlat=True) and the grouper is - different than 'time', then the data is first aggregated by the grouper - (e.g. get the month means over all years) before calculating the lon-lat - metric per time step. - save_path : str, optional - Path to save the netcdf if ret_type = 'raster'. Should be on the format - "path/folder/", i.e. with the final "/" - mask_by_shape : bool - If True, then the values of the Xarray are masked by the shape, otherwise - it is used only to crop the Xarray to the bounds of the shape. - k_to_celsius : bool - If True, then the values are converted from Kelvin to Celsius (C = k - 273.15). - Default is False, so the users need to really check if the dataset is in Kelvin. - plot : bool - Should the function plot the maps during the process? Default is False. 
- - Returns - ------- - saved netcdf - files are saved on path, or: - dataframe - containing the values - - Typical usage example: - ------- - user = 'myuser' - pass = 'mypassword' - shp_path = 'path/to/my_shape.shp' - out_path = 'data/saved/' - retrieve_cop('METOFFICE-GLO-SST-L4-REP-OBS-SST', 'analyzed_sst', - user, pass, shp_path, 'MRGID', '26836', ['2010-01-15', '2010-01-17']) - - Depends on the following packages: - ------- - matplotlib, xarray, numpy, pandas, geopandas and regionmask - For saving netcdf, also netCDF4 - You may need to install lxml - - Possible combinations of parameters: - ------- - Depending on the way you combine the group_by_lonlat, res_by_month and grouper - parameters you can generate a large array of results. - - If you have a daily dataset, spanning several years, you can get monthly maps - of the chosen metric by first using res_by_month = True and then grouper = 'time.month'. - If instead you want the maximum over all months use grouper = 'time'. - - For getting a metric over the area instead, you set group_by_lonlat = True. That - way, the metric is calculated over the pixels (lon-lat coordinates). So, the maximum - will return the pixel with maximum value. Because the way this calculation is done, - grouping by different time steps necessarily involve first aggregating the results by - the refered time step. So using grouper = 'time.year' in this case will make the - function to first get the year means, and then the metric over each year (e.g. - pixel with maximum temperature of that year). - - If you need more flexibility on the calculations you have to do, then the best - way is to first open the dataset using inspect_cop or inspect_ds and then - perform the filterings/combinations/calculations using the returned object. - For that, you must use the xarray functions, which are well described in the - docs: https://docs.xarray.dev/en/latest/index.html - """ - - import matplotlib.pyplot as plt - import xarray as xr - import numpy as np - import pandas as pd - import geopandas as gpd - import regionmask - - if metric != None and group_by_lonlat == True: - if ret_type != 'df': - raise ValueError('When grouping by lonlat, ret_type should be "df". Check arguments.') - - if ret_type != 'raster' and ret_type != 'df': - raise ValueError('ret_type should be one of "raster" or "df"') - - #! /usr/bin/env python3 - # -*- coding: utf-8 -*- - __author__ = "Copernicus Marine User Support Team" - __copyright__ = "(C) 2022 E.U. Copernicus Marine Service Information" - __credits__ = ["E.U. Copernicus Marine Service Information"] - __license__ = "MIT License - You must cite this source" - __version__ = "202104" - __maintainer__ = "D. Bazin, E. DiMedio, C. 
Giordan" - __email__ = "servicedesk dot cmems at mercator hyphen ocean dot eu" - - def copernicusmarine_datastore(ds, user, pwd): - from pydap.client import open_url - from pydap.cas.get_cookies import setup_session - cas_url = 'https://cmems-cas.cls.fr/cas/login' - session = setup_session(cas_url, user, pwd) - session.cookies.set("CASTGC", session.cookies.get_dict()['CASTGC']) - database = ['my', 'nrt'] - url = f'https://{database[0]}.cmems-du.eu/thredds/dodsC/{ds}' - try: - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - except: - url = f'https://{database[1]}.cmems-du.eu/thredds/dodsC/{dataset}' - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - return data_store - - if isinstance(dataset, list): - print('Dataset provided', flush = True) - DS = dataset[1] - dataset = dataset[0] - else: - print('Loading dataset...', flush = True) - data_store = copernicusmarine_datastore(dataset, username, password) - DS = xr.open_dataset(data_store) - - print('Loading shapefile', flush = True) - shape_area = gpd.read_file(shape) - - print('Slicing time and area for region '+shape_var+' '+shape_filter, flush = True) - sel_area = shape_area[getattr(shape_area, shape_var) == str(shape_filter)] - - if plot: - f, ax = plt.subplots() - sel_area.plot(ax=ax) - ax.set(title="MRGID "+str(shape_filter)) - plt.show() - - sel_area_lat = [float(sel_area.total_bounds[1]), float(sel_area.total_bounds[3])] - sel_area_lon = [float(sel_area.total_bounds[0]), float(sel_area.total_bounds[2])] - - if 'longitude' in DS.dims: - DS = DS.rename({'longitude' : 'lon', 'latitude' : 'lat'}) - if 'Longitude' in DS.dims: - DS = DS.rename({'Longitude' : 'lon', 'Latitude' : 'lat'}) - - if max(DS.lon) > 190: - print('Converting from 0/360 to -180/180. 
Check plot to see if it is ok.') - lon_name = 'lon' - DS['_lon_cor'] = xr.where( - DS[lon_name] > 180, - DS[lon_name] - 360, - DS[lon_name]) - DS = ( - DS - .swap_dims({lon_name: '_lon_cor'}) - .sel(**{'_lon_cor': sorted(DS._lon_cor)}) - .drop(lon_name)) - - DS = DS.rename({'_lon_cor': lon_name}) - - if plot: - to_plot = DS[variable].isel(time=1) - f, ax = plt.subplots() - to_plot.plot(ax=ax) - plt.show() - - start_date = time_range[0] - end_date = time_range[1] - - sliced_data = DS[variable].sel( - time=slice(start_date, end_date), - lon=slice(sel_area_lon[0], sel_area_lon[1]), - lat=slice(sel_area_lat[0], sel_area_lat[1])) - - if k_to_celsius: - sliced_data = sliced_data - 273.15 - - if mask_by_shape: - #area_mask = regionmask.mask_3D_geopandas(sel_area, sliced_data.lon, sliced_data.lat) - area_mask = regionmask.mask_geopandas(sel_area, sliced_data) - - #sliced_data = sliced_data.where(area_mask) - sliced_data = sliced_data.where(area_mask == 0) - - if plot: - sliced_data.plot(col='time', col_wrap=1, figsize=(10, 10)) - plt.show() - - else: - if plot: - sliced_data.plot(col='time', col_wrap=1, figsize=(10, 10)) - plt.show() - - if res_by_month: - print("Resampling (mean by month).", flush = True) - sliced_data = sliced_data.resample(time='1MS').mean('time',keep_attrs=True,skipna=False) - - if metric != None: - print("Calculating metric.", flush = True) - - if group_by_lonlat: - print("Grouping by "+grouper+" and calculating metric over lon-lat.", flush = True) - sliced_data = sliced_data.groupby(grouper) - if grouper != 'time': - sliced_data = sliced_data.mean() - sli_by = ['lat', 'lon'] - else: - sli_by = 'time' - if grouper != 'time': - sliced_data = sliced_data.groupby(grouper) - - if not isinstance(metric, list): - metric = [metric] - - for i in range(len(metric)): - smetric = metric[i] - print('Getting metric '+smetric) - - if smetric == 'mean': - int_data = sliced_data.mean(sli_by).rename('mean') - elif smetric == 'min': - int_data = sliced_data.min(sli_by).rename('min') - elif smetric == 'max': - int_data = sliced_data.max(sli_by).rename('max') - elif smetric == 'median': - int_data = sliced_data.median(sli_by).rename('median') - elif smetric == 'sum': - int_data = sliced_data.sum(sli_by).rename('sum') - elif smetric == 'std': - int_data = sliced_data.std(sli_by).rename('sd') - elif smetric == 'meanstd': - int_data_a = sliced_data.mean(sli_by).rename('mean') - int_data_b = sliced_data.std(sli_by).rename('sd') - int_data = xr.merge([int_data_a, int_data_b]) - else: - print("Metric not found, returning mean and standard deviation.", flush = True) - int_data_a = sliced_data.mean(sli_by).rename('mean') - int_data_b = sliced_data.std(sli_by).rename('sd') - int_data = xr.merge([int_data_a, int_data_b]) - smetric = 'meanstd' - - if i == 0: - final_data = int_data - else: - final_data = xr.merge([final_data, int_data]) - - if len(metric) > 1: - send = "_summarised.nc" - else: - send = "_"+smetric+".nc" - - if ret_type == 'df': - final_data = final_data.to_dataframe() - - if metric == None: - final_data = sliced_data - send = ".nc" - - if ret_type == 'raster': - final_data.to_netcdf(save_path+dataset+"_mrgid"+str(shape_filter)+send) - print("File saved.", flush = True) - return save_path+dataset+"_mrgid"+str(shape_filter)+send - else: - print("Data retrieving done.", flush = True) - return final_data diff --git a/functions/retrieve_sst_fun3d.py b/functions/retrieve_sst_fun3d.py deleted file mode 100644 index 12e6660..0000000 --- a/functions/retrieve_sst_fun3d.py +++ /dev/null @@ -1,342 
+0,0 @@ -############# eDNA Expeditions - UNESCO World Heritage Marine Sites ############ -################# Data analysis of the project - Thermal niche ################# -# August of 2023 -# Authors: Silas C. Principe, Pieter Provoost -# Contact: s.principe@unesco.org -# -############## Retrieve satellite data from Marine Copernicus (3D) ############# -def retrieve_cop_3d(dataset, variable, username, password, shape, shape_var, shape_filter, time_range, depth_range, - ret_type = 'raster', agg_depth = False, metric = None, group_by_lonlat = False, res_by_month = False, - grouper = 'time', save_path = None, mask_by_shape = True, k_to_celsius = False, plot = False): - - """Retrieve information from Copernicus Marine data products with 3 dimensions - - Retrieve, select and optionally summarise Copernicus satellite data with a depth - component. This function also enable to retrieve data from other datasets opened - with the function 'inspect_ds'. In that case, you should supply the result of - the function as the dataset argument, following the instructions on the - parameters section. Note: the 'depth' component should be named 'depth' in the - dataset. - - Parameters - ---------- - dataset : str or list - Name of the dataset (e.g. dataset-armor-3d-rep-monthly) or a list containing - the name of the dataset and the Xarray dataset object. The second approach is - relevant when you will loop over several areas or make distinct metric operations. - For getting the Xarray dataset, use first the function inspect_cop (or alternatively - inspect_ds for non-Copernicus datasets). NOTE: the list SHOULD be supplied - in that order (i.e. name, dataset). - variable : str - Name of the variable (e.g. to) - username : str - Copernicus username - password : str - Copernicus password - shape : str - Path to shapefile containing regions to mask - shape_var : str - Attribute (column) of the shapefile used to filter (e.g. MRGID) - shape_filter : str or numeric - Code defining the filter on the selected attribute - time_range : list, str - A list with two values defining the time range. Should be on the format - YYYY-MM-DD. E.g. ['2010-01-15', '2010-01-17'] - depth_range : list, str - A list with two values defining the depth range. Should be on the format - min-max. E.g. [0, 10]. Note that in some cases the depth may be stored - as negative, in which case it should be [-10, 0]. Inspect your dataset - to know which one to use. - ret_type : str - Which value to return. There are 2 types available: - 'raster' - a netcdf raster is saved in save_path containing the whole - sliced (and optionally masked) dataset. - 'df' - a data frame like object is returned. Note that depending on the - size of your dataset this may be problematic! Use with care. - agg_depth : bool - If True, then first the dataset is aggregated over depth (i.e. the mean - over the depths for each time step). Default is False. - metric : str or list, optional - Metric to summarise the results. The metric is usually applied over the layers, - but can also produce a single value by changing the group_by_lonlat argument. Note - that if you set group_by_lonlat = True, then you should also define ret_type to - 'df', otherwise the function will stop. - Metric can be mean, min, max, median, sum, and std. There is one special available - which calculate both the mean and standard deviation: 'meanstd' If you supply - a list of metrics, then all will be calculated and merged in a single file or - data frame. 
The saved file (if one is saved) will be named "{name}_summarised.nc" - instead of "{name}_{metric}.nc". - group_by_lonlat : bool - If set to True and a metric is chosen, then the summary is applied over - the lon-lat grouped by time (i.e. a single value per time step). The time - step may be grouped differently by changing the 'grouper' argument. In any case, - if the dataset was not aggregated by depth, metrics are calculated for each depth. - res_by_month : bool - If set to True, daily information is resampled to monthly values (mean of - each month). This option is only relevant if you have daily data. Note that - after resampling, metrics are calculated with the resampled version. - grouper : str - The grouper before calculating the metrics. It is usually 'time', but can - also be 'time.month' to aggregate by month or other valid Xarray value. - If group is done by lon-lat (group_by_lonlat=True) and the grouper is - different than 'time', then the data is first aggregated by the grouper - (e.g. get the month means over all years) before calculating the lon-lat - metric per time step. - save_path : str, optional - Path to save the netcdf if ret_type = 'raster'. Should be on the format - "path/folder/", i.e. with the final "/" - mask_by_shape : bool - If True, then the values of the Xarray are masked by the shape, otherwise - it is used only to crop the Xarray to the bounds of the shape. - k_to_celsius : bool - If True, then the values are converted from Kelvin to Celsius (C = k - 273.15). - Default is False, so the users need to really check if the dataset is in Kelvin. - plot : bool - Should the function plot the maps during the process? Default is False. - - Returns - ------- - saved netcdf - files are saved on path, or: - dataframe - containing the values - - Typical usage example: - ------- - user = 'myuser' - pass = 'mypassword' - shp_path = 'path/to/my_shape.shp' - out_path = 'data/saved/' - retrieve_cop('dataset-armor-3d-rep-monthly', 'to', - user, pass, shp_path, 'MRGID', '26836', ['2010-01-15', '2010-01-17'], - [0, 10]) - - Depends on the following packages: - ------- - matplotlib, xarray, numpy, pandas, geopandas and regionmask - For saving netcdf, also netCDF4 - You may need to install lxml - - Possible combinations of parameters: - ------- - Depending on the way you combine the group_by_lonlat, res_by_month and grouper - parameters you can generate a large array of results. - - If you have a daily dataset, spanning several years, you can get monthly maps - of the chosen metric by first using res_by_month = True and then grouper = 'time.month'. - If instead you want the maximum over all months use grouper = 'time'. - - For getting a metric over the area instead, you set group_by_lonlat = True. That - way, the metric is calculated over the pixels (lon-lat coordinates). So, the maximum - will return the pixel with maximum value. Because the way this calculation is done, - grouping by different time steps necessarily involve first aggregating the results by - the refered time step. So using grouper = 'time.year' in this case will make the - function to first get the year means, and then the metric over each year (e.g. - pixel with maximum temperature of that year). - - If you need more flexibility on the calculations you have to do, then the best - way is to first open the dataset using inspect_cop or inspect_ds and then - perform the filterings/combinations/calculations using the returned object. 
- For that, you must use the xarray functions, which are well described in the - docs: https://docs.xarray.dev/en/latest/index.html - """ - - import matplotlib.pyplot as plt - import xarray as xr - import numpy as np - import pandas as pd - import geopandas as gpd - import regionmask - - if metric != None and group_by_lonlat == True: - if ret_type != 'df': - raise ValueError('When grouping by lonlat, ret_type should be "df". Check arguments.') - - if ret_type != 'raster' and ret_type != 'df': - raise ValueError('ret_type should be one of "raster" or "df"') - - #! /usr/bin/env python3 - # -*- coding: utf-8 -*- - __author__ = "Copernicus Marine User Support Team" - __copyright__ = "(C) 2022 E.U. Copernicus Marine Service Information" - __credits__ = ["E.U. Copernicus Marine Service Information"] - __license__ = "MIT License - You must cite this source" - __version__ = "202104" - __maintainer__ = "D. Bazin, E. DiMedio, C. Giordan" - __email__ = "servicedesk dot cmems at mercator hyphen ocean dot eu" - - def copernicusmarine_datastore(ds, user, pwd): - from pydap.client import open_url - from pydap.cas.get_cookies import setup_session - cas_url = 'https://cmems-cas.cls.fr/cas/login' - session = setup_session(cas_url, user, pwd) - session.cookies.set("CASTGC", session.cookies.get_dict()['CASTGC']) - database = ['my', 'nrt'] - url = f'https://{database[0]}.cmems-du.eu/thredds/dodsC/{ds}' - try: - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - except: - url = f'https://{database[1]}.cmems-du.eu/thredds/dodsC/{dataset}' - data_store = xr.backends.PydapDataStore(open_url(url, session=session)) # needs PyDAP >= v3.3.0 see https://github.com/pydap/pydap/pull/223/commits - return data_store - - if isinstance(dataset, list): - print('Dataset provided', flush = True) - DS = dataset[1] - dataset = dataset[0] - else: - print('Loading dataset...', flush = True) - data_store = copernicusmarine_datastore(dataset, username, password) - DS = xr.open_dataset(data_store) - - print('Loading shapefile', flush = True) - shape_area = gpd.read_file(shape) - - print('Slicing time and area for region '+shape_var+' '+shape_filter, flush = True) - sel_area = shape_area[getattr(shape_area, shape_var) == str(shape_filter)] - - if plot: - f, ax = plt.subplots() - sel_area.plot(ax=ax) - ax.set(title="MRGID "+str(shape_filter)) - plt.show() - - sel_area_lat = [float(sel_area.total_bounds[1]), float(sel_area.total_bounds[3])] - sel_area_lon = [float(sel_area.total_bounds[0]), float(sel_area.total_bounds[2])] - - if DS.attrs["easternmost_longitude"] > 190: - sel_area_lon[0] = sel_area_lon[0] + 360 - sel_area_lon[1] = sel_area_lon[1] + 360 - - start_date = time_range[0] - end_date = time_range[1] - - start_depth = depth_range[0] - end_depth = depth_range[1] - - if 'longitude' in DS.dims: - DS = DS.rename({'longitude' : 'lon', 'latitude' : 'lat'}) - if 'Longitude' in DS.dims: - DS = DS.rename({'Longitude' : 'lon', 'Latitude' : 'lat'}) - - if max(DS.lon) > 190: - print('Converting from 0/360 to -180/180. 
Check plot to see if it is ok.') - lon_name = 'lon' - DS['_lon_cor'] = xr.where( - DS[lon_name] > 180, - DS[lon_name] - 360, - DS[lon_name]) - DS = ( - DS - .swap_dims({lon_name: '_lon_cor'}) - .sel(**{'_lon_cor': sorted(DS._lon_cor)}) - .drop(lon_name)) - - DS = DS.rename({'_lon_cor': lon_name}) - - if plot: - to_plot = DS[variable].isel(time=1) - f, ax = plt.subplots() - to_plot.plot(ax=ax) - plt.show() - - sliced_data = DS[variable].sel( - depth=slice(start_depth, end_depth), - time=slice(start_date, end_date), - lon=slice(sel_area_lon[0], sel_area_lon[1]), - lat=slice(sel_area_lat[0], sel_area_lat[1])) - - if k_to_celsius: - sliced_data = sliced_data - 273.15 - - if mask_by_shape: - area_mask = regionmask.mask_3D_geopandas(sel_area, sliced_data.lon, sliced_data.lat) - #area_mask = regionmask.mask_geopandas(sel_area, sliced_data) - - sliced_data = sliced_data.where(area_mask) - #sliced_data = sliced_data.where(area_mask == 0) - - if plot: - sliced_data.isel(time=1, depth=1).plot(figsize=(10, 10)) - plt.show() - - else: - if plot: - sliced_data.isel(time=1, depth=1).plot(figsize=(10, 10)) - plt.show() - - if res_by_month: - print("Resampling (mean by month).", flush = True) - sliced_data = sliced_data.resample(time='1MS').mean('time',keep_attrs=True,skipna=False) - - if agg_depth: - sliced_data = sliced_data.mean('depth') - - if metric != None: - print("Calculating metric.", flush = True) - - if group_by_lonlat: - print("Grouping by "+grouper+" and calculating metric over lon-lat.", flush = True) - sliced_data = sliced_data.groupby(grouper) - if grouper != 'time': - sliced_data = sliced_data.mean() - sli_by = ['lat', 'lon'] - else: - sli_by = 'time' - if grouper != 'time': - sliced_data = sliced_data.groupby(grouper) - - if not isinstance(metric, list): - metric = [metric] - - for i in range(len(metric)): - smetric = metric[i] - print('Getting metric '+smetric) - - if smetric == 'mean': - int_data = sliced_data.mean(sli_by).rename('mean') - elif smetric == 'min': - int_data = sliced_data.min(sli_by).rename('min') - elif smetric == 'max': - int_data = sliced_data.max(sli_by).rename('max') - elif smetric == 'median': - int_data = sliced_data.median(sli_by).rename('median') - elif smetric == 'sum': - int_data = sliced_data.sum(sli_by).rename('sum') - elif smetric == 'std': - int_data = sliced_data.std(sli_by).rename('sd') - elif smetric == 'meanstd': - int_data_a = sliced_data.mean(sli_by).rename('mean') - int_data_b = sliced_data.std(sli_by).rename('sd') - int_data = xr.merge([int_data_a, int_data_b]) - else: - print("Metric not found, returning mean and standard deviation.", flush = True) - int_data_a = sliced_data.mean(sli_by).rename('mean') - int_data_b = sliced_data.std(sli_by).rename('sd') - int_data = xr.merge([int_data_a, int_data_b]) - smetric = 'meanstd' - - if i == 0: - final_data = int_data - else: - final_data = xr.merge([final_data, int_data]) - - if len(metric) > 1: - send = "_summarised.nc" - else: - send = "_"+smetric+".nc" - - if ret_type == 'df': - final_data = final_data.to_dataframe() - - if metric == None: - final_data = sliced_data - send = ".nc" - - if ret_type == 'raster': - final_data.to_netcdf(save_path+dataset+"_mrgid"+str(shape_filter)+send) - print("File saved.", flush = True) - return save_path+dataset+"_mrgid"+str(shape_filter)+send - else: - print("Data retrieving done.", flush = True) - return final_data
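Note on the deleted 3-D helper above: its docstring's "Typical usage example" invokes the function under the 2-D name `retrieve_cop` and leaves out `save_path`, which the default `ret_type = 'raster'` needs in order to write the NetCDF. The sketch below is a minimal illustration of how `retrieve_cop_3d` would be called as documented; it assumes the removed module is still importable, and the credentials, shapefile path, MRGID and output folder are placeholders rather than values taken from this repository.

```python
# Minimal usage sketch for the (now removed) retrieve_cop_3d helper.
# Assumes functions/retrieve_sst_fun3d.py is on the Python path;
# all paths, credentials and the MRGID below are placeholders.
from functions.retrieve_sst_fun3d import retrieve_cop_3d

out_file = retrieve_cop_3d(
    dataset='dataset-armor-3d-rep-monthly',      # Copernicus product ID
    variable='to',                               # temperature variable in the product
    username='myuser',                           # Copernicus credentials
    password='mypassword',
    shape='path/to/my_shape.shp',                # polygons used to crop/mask
    shape_var='MRGID',                           # shapefile attribute to filter on
    shape_filter='26836',                        # which polygon to keep
    time_range=['2010-01-15', '2010-01-17'],     # YYYY-MM-DD
    depth_range=[0, 10],                         # e.g. 0-10 m (sign convention depends on the dataset)
    ret_type='raster',                           # write a NetCDF and return its path
    agg_depth=True,                              # average over the selected depths first
    metric='meanstd',                            # per-pixel mean and standard deviation
    save_path='data/saved/',                     # must end with "/"
)

# With a single metric the helper appends "_<metric>.nc", so the call above
# would write: data/saved/dataset-armor-3d-rep-monthly_mrgid26836_meanstd.nc
print(out_file)
```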