From 6d819e1d0af6a35f8c685294e5edbc30654aae90 Mon Sep 17 00:00:00 2001
From: silasprincipe <53846571+silasprincipe@users.noreply.github.com>
Date: Tue, 19 Nov 2024 15:15:08 +0100
Subject: [PATCH] Update codes for glossy report

Reflect new lists on nov2024
---
 codes/get_sst_summaries.R    | 265 +++++++++++++++++++++++++++++
 codes/nearby_from_nc.R       | 188 +++++++++++++++++++++
 codes/nearby_from_nc_terra.R |  70 ++++++++
 codes/occurrence.R           |  56 +++++++
 codes/plot_paf_newlist.R     | 315 +++++++++++++++++++++++++++++++++++
 5 files changed, 894 insertions(+)
 create mode 100644 codes/get_sst_summaries.R
 create mode 100644 codes/nearby_from_nc.R
 create mode 100644 codes/nearby_from_nc_terra.R
 create mode 100644 codes/occurrence.R
 create mode 100644 codes/plot_paf_newlist.R

diff --git a/codes/get_sst_summaries.R b/codes/get_sst_summaries.R
new file mode 100644
index 0000000..b5cf16c
--- /dev/null
+++ b/codes/get_sst_summaries.R
@@ -0,0 +1,265 @@
+#################### eDNA expeditions - scientific analysis ####################
+########################## Environmental data download #########################
+# November of 2024
+# Author: Silas C. Principe
+# Contact: s.principe@unesco.org
+#
+############### Retrieve SST summaries for species and sites ###################
+
+# Dependencies note
+# Part of this code can be executed with Python, using Xarray and Dask
+# You need to install those libraries
+# using reticulate::py_install()
+# To use that option, you can source the code "nearby_from_nc.R"
+# Here we use the R terra package instead. The functions syntax are exactly the same
+
+
+# Load packages -----
+library(arrow)
+library(dplyr)
+library(tidyr)
+library(biooracler)
+library(obissdm)
+library(h3jsr)
+library(terra)
+
+
+
+# Settings -----
+datafolder <- "data"
+fs::dir_create(datafolder)
+outfolder <- "results"
+
+
+
+# Download data ------
+# Species grids
+gridf <- file.path(datafolder, 'speciesgrids')
+fs::dir_create(gridf)
+system(glue::glue("aws s3 cp --recursive s3://obis-products/speciesgrids/h3_7 {gridf} --no-sign-request"))
+
+# Environmental data
+datasets <- c(
+  "thetao_baseline_2000_2019_depthsurf"
+)
+
+datasets <- c(datasets,
+              gsub("depthsurf", "depthmean", datasets))
+
+future_scenarios <- c("ssp126", "ssp245", "ssp370", "ssp585")
+
+time_steps <- list(
+  current = c("2000-01-01T00:00:00Z", "2010-01-01T00:00:00Z"), #2000-2010/2010-2020
+  dec50 = c("2030-01-01", "2040-01-01"), #2030-2040/2040-2050
+  dec100 = c("2080-01-01", "2090-01-01") #2080-2090/2090-2100
+)
+
+variables <- c("mean")
+
+get_env_data(datasets = datasets, future_scenarios = future_scenarios,
+             time_steps = time_steps, variables = variables,
+             terrain_vars = NULL, average_time = T, outdir = datafolder)
+
+
+
+# Prepare environmental layers -------
+fpaths <- file.path(datafolder, paste0(
+    "future/", future_scenarios, "/thetao_", future_scenarios, "_depthsurf_mean.tif"
+))
+all_paths <- c(
+    file.path(datafolder, "current/thetao_baseline_depthsurf_mean.tif"),
+    gsub("mean", "dec50_mean", fpaths),
+    gsub("mean", "dec100_mean", fpaths)
+)
+surface <- rast(all_paths)
+bottom <- rast(gsub("depthsurf", "depthmean", all_paths))
+
+names(surface) <- gsub("thetao_", "", gsub(".tif", "", basename(sources(surface))))
+names(bottom) <- gsub("thetao_", "", gsub(".tif", "", basename(sources(bottom))))
+
+# Save as a single netcdf, as this will be used by the R or Python functions
+writeCDF(sds(surface[[1]], bottom[[1]]), file.path(datafolder, "biooracle_data.nc"),
+         overwrite = T)
+
+
+# Get list of species ------
+source("codes/occurrence.R")
+download_occurrence()
+
+occ <- read_occurrence_data()
+
+occ <- occ %>%
+    filter(taxonRank == "species") %>%
+    select(scientificName, higherGeography) %>%
+    group_by(scientificName, higherGeography) %>%
+    distinct(scientificName, .keep_all = T) %>%
+    ungroup()
+
+species_list <- occ %>% distinct(scientificName)
+
+
+
+# Extract data and get nearby for those invalid -------
+source("codes/nearby_from_nc_terra.R")
+
+# Uncomment and use lines below to use Python version
+# library(reticulate)
+# source("codes/nearby_from_nc.R")
+# da <- import("dask")
+# dd <- import("dask.distributed")
+# client <- dd$Client()
+# browseURL("http://localhost:8787/status")
+# xr <- import("xarray")
+
+target_nc <- file.path(datafolder, "biooracle_data.nc")
+
+quantile_df <- function(x, probs = c(0.05, 0.95)) {
+  res <- tibble(metric = c(paste0("q_", probs), "mean", "sd", "no_data", "with_data"), 
+                value = c(quantile(x, probs, na.rm = T),
+                          mean(x, na.rm = T),
+                          sd(x, na.rm = T),
+                          sum(is.na(x)),
+                          sum(!is.na(x))))
+  res
+}
+
+grids_ds <- open_dataset(gridf)
+
+grids_ds <- grids_ds %>%
+    select(species, AphiaID, min_year, cell)
+
+number_recs <- grids_ds %>%
+    filter(species %in% species_list$scientificName) %>%
+    group_by(species) %>%
+    count() %>%
+    collect()
+
+number_recs <- number_recs %>%
+    ungroup() %>%
+    mutate(cumulative_sum = cumsum(n),
+         batch = cumsum(cumulative_sum >= lag(cumulative_sum, default = 0) + 100000)) %>%
+    select(-cumulative_sum)
+
+number_recs$batch <- number_recs$batch + 1
+
+species_batch <- unique(number_recs$batch)
+results_batch <- lapply(seq_along(species_batch), function(x) NULL)
+
+for (spi in species_batch) {
+    cat("\nRunning batch", spi, "out of", length(species_batch), "\n\n")
+    sel_species <- number_recs$species[number_recs$batch == spi]
+
+    sel_data <- grids_ds %>% 
+        filter(species %in% sel_species) %>%
+        collect()
+
+    sel_data_crds <- as.data.frame(sf::st_coordinates(h3jsr::cell_to_point(sel_data$cell)))
+    sel_data_crds <- bind_cols(sel_data_crds, sel_data)
+    colnames(sel_data_crds)[1:2] <- c("decimalLongitude", "decimalLatitude")
+
+    grids_result <- extract_from_nc(target_nc, sel_data_crds[,1:2])
+    var_col <- which(grepl("thetao", colnames(grids_result)))[1]
+
+    grids_result <- bind_cols(grids_result, sel_data)
+
+    coords_na <- which(is.na(grids_result[, var_col]))
+
+    if (length(coords_na) > 1) {
+        new_coords <- get_nearby(target_nc, names(grids_result)[var_col],
+                                grids_result[coords_na, 1:2], mode = "25")
+        na_approx <- which(!is.na(new_coords[, "value"]))
+        new_coords$decimalLongitude[na_approx] <- new_coords$new_lon[na_approx]
+        new_coords$decimalLatitude[na_approx] <- new_coords$new_lat[na_approx]
+
+        new_info <- grids_result[coords_na, colnames(sel_data)]
+        grids_result <- grids_result[-coords_na,]
+        new_result <- extract_from_nc(target_nc, new_coords[,1:2])
+        grids_result <- bind_rows(grids_result, bind_cols(new_result, new_info))
+    }
+
+    grids_result <- grids_result %>%
+        select(cell, species, starts_with("thetao")) %>%
+        pivot_longer(starts_with("thetao"), names_to = "variable", values_to = "values") %>%
+        group_by(variable, species) %>%
+        distinct(cell, .keep_all = T) %>%
+        reframe(quantile_df(values)) %>%
+        separate_wider_delim(cols = "variable",
+                            names = c("variable", "baseline", "depth", "variant"),
+                            delim = "_") %>%
+        select(-variable, -variant, scenario = baseline)
+
+    results_batch[[spi]] <- grids_result
+}
+
+results_batch <- do.call("rbind", results_batch)
+
+write_parquet(results_batch, file.path(datafolder, "species_thermal_lims.parquet"))
+
+
+
+# Get temperature on sites on all scenarios ------
+# sites <- jsonlite::read_json("https://raw.githubusercontent.com/iobis/edna-tracker-data/data/generated.json")
+# sites_samples <- sites$samples %>% bind_rows()
+sites_shape <- sf::read_sf("https://samples.ednaexpeditions.org/sites.geojson")
+sites_shape <- terra::vect(sites_shape)
+
+sites_values <- terra::extract(c(surface, bottom), sites_shape, fun = mean, na.rm = T)
+sites_values$higherGeography <- sites_shape$name
+sites_values <- sites_values %>% filter(!is.na(baseline_depthsurf_mean))
+
+sites_values$higherGeography[sites_values$higherGeography == "Península Valdés"] <- "Peninsula Valdès"
+sites_values$higherGeography[sites_values$higherGeography == "The Wadden Sea"] <- "Wadden Sea"
+
+write_parquet(sites_values, file.path(datafolder, "sites_sst_all.parquet"))
+
+species_site_temp <- left_join(occ, sites_values) %>% select(-ID)
+
+species_lims <- results_batch %>% 
+    select(species, metric, value, scenario, depth) %>%
+    pivot_wider(names_from = metric, values_from = value, id_cols = c("species", "scenario", "depth"))
+
+species_lims_dsurf <- species_lims %>%
+    filter(depth == "depthsurf") %>%
+    rename(scientificName = species)
+species_lims_dmean <- species_lims %>%
+    filter(depth == "depthmean") %>%
+    rename(scientificName = species)
+
+colnames(species_lims_dsurf)[4:9] <- paste0("dsurf_", colnames(species_lims_dsurf)[4:9])
+colnames(species_lims_dmean)[4:9] <- paste0("dmean_", colnames(species_lims_dmean)[4:9])
+
+species_site_temp <- left_join(species_site_temp, species_lims_dsurf[,c("scientificName", "dsurf_q_0.95")])
+species_site_temp <- left_join(species_site_temp, species_lims_dmean[,c("scientificName", "dmean_q_0.95")])
+
+sites_species_risk <- species_site_temp %>%
+    mutate(across(contains("depthsurf"), ~ .x - dsurf_q_0.95)) %>%
+    mutate(across(contains("depthmean"), ~ .x - dmean_q_0.95)) %>%
+    rename(limit_dsurf = dsurf_q_0.95, limit_dmean = dmean_q_0.95) %>%
+    mutate(across(3:length(.), ~round(.x, 2)))
+
+# View results
+sites_species_risk %>%
+    filter(higherGeography == 'Aldabra Atoll') %>%
+    select(scientificName, 
+     current = baseline_depthsurf_mean,
+     ssp1 = ssp126_depthsurf_dec100_mean,
+     ssp2 = ssp245_depthsurf_dec100_mean,
+     ssp3 = ssp370_depthsurf_dec100_mean,
+     ssp5 = ssp585_depthsurf_dec100_mean)
+
+write_parquet(sites_species_risk, file.path(datafolder, "species_sites_risk.parquet"))
+
+# Example application
+sites_species_risk %>%
+    filter(higherGeography == 'Aldabra Atoll') %>%
+    select(scientificName, 
+     current = baseline_depthsurf_mean,
+     ssp1 = ssp126_depthsurf_dec100_mean,
+     ssp2 = ssp245_depthsurf_dec100_mean,
+     ssp3 = ssp370_depthsurf_dec100_mean) %>%
+     mutate(across(2:length(.), ~ ifelse(.x <= 0, 0, 1))) %>%
+     summarise(current = sum(current, na.rm = T), ssp1 = sum(ssp1, na.rm = T), ssp2 = sum(ssp2, na.rm = T), ssp3 = sum(ssp3, na.rm = T)) %>%
+     pivot_longer(1:4, names_to = "scenario", values_to = "species") %>%
+     ggplot2::ggplot() +
+        ggplot2::geom_bar(ggplot2::aes(x = scenario, y = species), stat = "identity")
+
diff --git a/codes/nearby_from_nc.R b/codes/nearby_from_nc.R
new file mode 100644
index 0000000..52c77de
--- /dev/null
+++ b/codes/nearby_from_nc.R
@@ -0,0 +1,188 @@
+extract_from_nc <- function(netcdf, coordinates, variable = NULL) {
+    if (!exists("xr")) {
+        stop("Xarray is not loaded. Load it using `xr <- reticulate::import('xarray')`")
+    }
+
+    ds <- xr$open_dataset(netcdf, chunks = "auto")
+
+    if (!is.null(variable)) {
+        ds <- ds[variable]
+    }
+
+    nams_coords <- unlist(ds$coords$dims)
+
+    lons <- coordinates$decimalLongitude
+    lats <- coordinates$decimalLatitude
+
+    if (length(lons) < 2) {
+        lons <- list(lons)
+        lats <- list(lats)
+    }
+
+    lats <- xr$DataArray(lats, dims = "z")
+    lons <- xr$DataArray(lons, dims = "z")
+
+    if (any(grepl("latitude", nams_coords))) {
+        temp_res <- ds$sel(latitude = lats, longitude = lons, method = "nearest")
+    } else {
+        temp_res <- ds$sel(lat = lats, lon = lons, method = "nearest")
+    }
+
+    results <- temp_res$to_dataframe()
+
+    results_final <- cbind(coordinates, results)
+    results_final <- apply(results_final, 2, function(x) ifelse(is.nan(x), NA, x))
+
+    return(as.data.frame(results_final))
+}
+
+
+get_nearby <- function(netcdf, variable, coordinates, mode = "queen", verbose = TRUE) {
+
+    if (verbose) cat("Getting nearby valid cells for", nrow(coordinates), "records\n")
+
+    if (mode == "queen") {
+        fadj = 1
+    } else {
+        fadj = 2
+    }
+
+    ds <- xr$open_dataset(netcdf, chunks = "auto")
+    ds <- ds[variable]
+
+    nams_coords <- unlist(ds$coords$dims)
+
+    if ("longitude" %in% nams_coords) {
+        layer_x <- ds$longitude$to_dataframe()[,"longitude"]
+        layer_y <- ds$latitude$to_dataframe()[,"latitude"]
+    } else {
+        layer_x <- ds$lon$to_dataframe()[,"lon"]
+        layer_y <- ds$lat$to_dataframe()[,"lat"]
+    }
+    lim_x <- range(seq_along(layer_x))
+    lim_y <- range(seq_along(layer_y))
+
+    coordinates_idx <- coordinates
+
+    lons <- coordinates$decimalLongitude
+    lats <- coordinates$decimalLatitude
+
+    if (length(lons) < 2) {
+        lons <- list(lons)
+        lats <- list(lats)
+    }
+
+    lons <- xr$DataArray(lons, dims = "z")
+    lats <- xr$DataArray(lats, dims = "z")
+
+    if (any(grepl("latitude", nams_coords))) {
+        temp_res <- ds$sel(latitude = lats, longitude = lons, method = "nearest")
+    } else {
+        temp_res <- ds$sel(lat = lats, lon = lons, method = "nearest")
+    }
+
+    temp_res <- temp_res$to_dataframe()
+
+    extra_coords <- lapply(seq_len(nrow(temp_res)), function(x) NULL)
+
+    if (verbose && nrow(coordinates) > 1) {
+        pb <- progress::progress_bar$new(total = nrow(temp_res))
+        pbs <- TRUE
+    } else {
+        pbs <- FALSE
+    }
+
+    for (id in 1:nrow(coordinates)) {
+        if (pbs) pb$tick()
+        
+        id_dim <- temp_res[id,]
+
+        adj_x <- id_dim[,grep("lon", colnames(id_dim))]
+        adj_y <- id_dim[,grep("lat", colnames(id_dim))]
+
+        adj_x <- which(adj_x == layer_x)
+        adj_y <- which(adj_y == layer_y)
+
+        adj_x <- seq(adj_x - fadj, adj_x + fadj)
+        adj_y <- seq(adj_y - fadj, adj_y + fadj)
+
+        adj_x[adj_x < lim_x[1]] <- lim_x[2] + adj_x[adj_x < lim_x[1]]
+        adj_x[adj_x > lim_x[2]] <- adj_x[adj_x > lim_x[2]] - lim_x[2]
+
+        adj_x <- adj_x[adj_x >= lim_x[1] & adj_x <= lim_x[2]]
+        adj_y <- adj_y[adj_y >= lim_y[1] & adj_y <= lim_y[2]]
+
+        vals_grid <- expand.grid(x = adj_x, y = adj_y)
+        vals_grid$value <- vals_grid$cy <- vals_grid$cx <- NA
+        vals_grid$ID <- id
+        extra_coords[[id]] <- vals_grid
+    }
+
+    if (verbose) cat("Processing adjacent points...\n")
+    extra_coords <- do.call("rbind", extra_coords)
+
+    x_ids <- xr$DataArray(as.integer(extra_coords$x - 1), dims = "z")
+    y_ids <- xr$DataArray(as.integer(extra_coords$y - 1), dims = "z")
+
+    if ("longitude" %in% nams_coords) {
+        vg_pt <- ds$isel(
+            longitude = x_ids,
+            latitude = y_ids
+        )
+        vg_pt <- vg_pt$to_dataframe()
+    } else {
+        vg_pt <- ds$isel(
+            lon = x_ids,
+            lat = y_ids
+        )
+        vg_pt <- vg_pt$to_dataframe()
+    }
+
+    if (nrow(extra_coords) != nrow(vg_pt)) stop("Error on data extraction.")
+
+    extra_coords$cx <- vg_pt[,grep("lon", colnames(vg_pt))]
+    extra_coords$cy <- vg_pt[,grep("lat", colnames(vg_pt))]
+    extra_coords$value <- vg_pt[[variable]]
+    extra_coords$value[is.nan(extra_coords$value)] <- NA
+
+    coordinates_idx$ID <- seq_len(nrow(coordinates_idx))
+    extra_coords <- dplyr::left_join(extra_coords,
+                                     coordinates_idx[,c("decimalLongitude", "decimalLatitude", "ID")], by = "ID")
+
+    tfun <- function(cx, cy, value, decimalLongitude, decimalLatitude) {
+        if (all(is.na(value))) {
+            df <- data.frame(new_lon = NA, new_lat = NA, value = NA)
+        } else {
+            if (sum(!is.na(value)) == 1) {
+                df <- data.frame(new_lon = cx[!is.na(value)],
+                                 new_lat = cy[!is.na(value)],
+                                 value = value[!is.na(value)])
+            } else {
+                cx <- cx[!is.na(value)]
+                cy <- cy[!is.na(value)]
+                value <- value[!is.na(value)]
+
+                true_x <- decimalLongitude[1]
+                true_y <- decimalLatitude[1]
+                diff <- abs(cy - true_y) + abs(cx - true_x)
+                diff[diff >= 360] <- diff[diff >= 360] - 360
+
+                df <- data.frame(new_lon = cx[order(diff)][1],
+                                 new_lat = cy[order(diff)][1],
+                                 value = value[order(diff)][1])
+            }
+        }
+        return(dplyr::as_tibble(df))
+    }
+    
+    extra_coords <- extra_coords %>%
+        group_by(ID) %>%
+        summarise(tfun(cx, cy, value, decimalLongitude, decimalLatitude))
+
+    coordinates_idx <- bind_cols(
+        coordinates_idx[,c("decimalLongitude", "decimalLatitude")],
+        extra_coords[,c("value", "new_lon", "new_lat", "ID")]
+    )
+
+    return(coordinates_idx)
+}
diff --git a/codes/nearby_from_nc_terra.R b/codes/nearby_from_nc_terra.R
new file mode 100644
index 0000000..f9e13ff
--- /dev/null
+++ b/codes/nearby_from_nc_terra.R
@@ -0,0 +1,70 @@
+extract_from_nc <- function(netcdf, coordinates, variable = NULL) {
+
+    layer <- terra::rast(netcdf)
+
+    results <- terra::extract(layer, coordinates, ID = F)
+
+    results_final <- cbind(coordinates, results)
+
+    return(as.data.frame(results_final))
+}
+
+
+get_nearby <- function(netcdf, variable, coordinates, mode = "queen", verbose = TRUE) {
+
+    if (verbose) cat("Getting nearby valid cells for", nrow(coordinates), "records\n")
+
+    if (mode == "queen") {
+        fadj = 4
+    } else {
+        fadj = matrix(nrow = 5, ncol = 5)
+        fadj[] <- c(rep(1, 12), 0, rep(1, 12))
+    }
+
+    layer <- terra::rast(netcdf)
+    layer <- terra::subset(layer, variable)
+
+    adj_cells <- terra::adjacent(layer, terra::cellFromXY(layer, coordinates),
+                    directions = fadj)
+    adj_cells <- as.data.frame(adj_cells)
+
+    adj_cells$ID <- seq_len(nrow(adj_cells))
+    adj_cells <- adj_cells |>
+        tidyr::pivot_longer(1:(ncol(adj_cells)-1), names_to = "index", values_to = "cell") |>
+        dplyr::select(-index)
+
+    xy_adj <- terra::extract(layer[[1]], adj_cells$cell, xy = T)
+    xy_adj <- cbind(xy_adj, adj_cells)
+
+    adj_cells_result <- tapply(seq_len(nrow(xy_adj)), xy_adj$ID, function(idx){
+        target <- xy_adj[idx,]
+        values_c <- target[,3]
+        id <- target$ID[1]
+        if (all(is.na(values_c))) {
+            to_return <- data.frame(new_lon = coordinates$decimalLongitude[id],
+                                    new_lat = coordinates$decimalLatitude[id],
+                                    value = NA)
+        } else if (sum(!is.na(values_c)) == 1) {
+            to_return <- target[!is.na(values_c),1:3]
+            colnames(to_return) <- c("new_lon", "new_lat", "value")
+        } else {
+            focalcell <- coordinates[id,1:2]
+            possible <- target[!is.na(values_c),1:3]
+            true_x <- focalcell[1,1]
+            true_y <- focalcell[1,2]
+            diff <- abs(possible$y - true_y) + abs(possible$x - true_x)
+            diff[diff >= 360] <- diff[diff >= 360] - 360
+            to_return <- possible[order(diff)[1],]
+            colnames(to_return) <- c("new_lon", "new_lat", "value")
+        }
+        return(to_return)
+    })
+    adj_cells_result <- do.call("rbind", adj_cells_result)
+
+    coordinates_idx <- bind_cols(
+        coordinates[,c("decimalLongitude", "decimalLatitude")],
+        adj_cells_result[,c("value", "new_lon", "new_lat")]
+    )
+
+    return(coordinates_idx)
+}
diff --git a/codes/occurrence.R b/codes/occurrence.R
new file mode 100644
index 0000000..5e6e16d
--- /dev/null
+++ b/codes/occurrence.R
@@ -0,0 +1,56 @@
+library(dplyr)
+library(stringr)
+library(purrr)
+library(arrow)
+
+#' Download occurrence data
+download_occurrence <- function(force = FALSE) {
+  if (!dir.exists("data/output") || force) {
+    cat("Downloading occurrence data ======\n")
+    system("rm -r output output.zip")
+    download.file(
+        "https://obis-edna-results.s3.amazonaws.com/output.zip",
+        "output.zip"
+    )
+    unzip("output.zip", exdir = "data/")
+    file.remove("output.zip")
+  } else {
+    cat("Occurrence data already downloaded ======\n")
+  }
+}
+
+#' Reads the full occurrence dataset from TSV.
+read_occurrence_tsv <- function() {
+  occurrence_files <- list.files("data/output", "*Occurrence*", full.names = TRUE)
+  
+  dna_files <- list.files("data/output", "*DNADerivedData*", full.names = TRUE)
+  
+  dna <- map(dna_files, read.table, sep = "\t", quote = "", header = TRUE) %>%
+    bind_rows() %>%
+    mutate_if(is.character, na_if, "")
+
+  occurrence <- map(occurrence_files, read.table, sep = "\t", quote = "", header = TRUE, comment.char = "") %>%
+    bind_rows() %>%
+    mutate_if(is.character, na_if, "")%>%
+    left_join(dna, by = "occurrenceID")
+
+  return(occurrence)
+}
+
+#' Converts the occurrence dataset to parquet.
+convert_occurrence_data <- function() {
+  message("Converting TSV to parquet...")
+  occurrence <- read_occurrence_tsv()
+  write_parquet(occurrence, "data/output/occurrence.parquet")
+  return(occurrence)
+}
+
+#' Reads the full occurrence dataset from parquet, converts TSV to parquet if the parquet file does not exist.
+read_occurrence_data <- function() {
+  if (!file.exists("data/output/occurrence.parquet")) {
+    occurrence <- convert_occurrence_data()  
+  } else {
+    occurrence <- read_parquet("data/output/occurrence.parquet")
+  }
+  return(occurrence)
+}
diff --git a/codes/plot_paf_newlist.R b/codes/plot_paf_newlist.R
new file mode 100644
index 0000000..9fccadd
--- /dev/null
+++ b/codes/plot_paf_newlist.R
@@ -0,0 +1,315 @@
+#################### eDNA expeditions - scientific analysis ####################
+########################## Environmental data download #########################
+# January of 2024
+# Author: Silas Principe, Mike Burrows
+# Contact: s.principe@unesco.org; michael.burrows@sams.ac.uk
+#
+############## Plot eDNA data from UNESCO World Heritage Sites #################
+
+# Load packages ----
+library(dplyr)
+library(terra)
+library(tidyr)
+library(stringr)
+library(purrr)
+library(arrow)
+library(data.table)
+library(patchwork)
+library(ggplot2)
+library(ggdist)
+library(see)
+fs::dir_create("figures")
+
+# Load data ----
+# Load temperature summaries by site
+speciesthermsite <- read_parquet("data/species_sites_risk.parquet")
+colnames(speciesthermsite)[1] <- "species"
+
+# Process the data ----
+# Convert to data.table
+speciesthermsitedt <- data.table(speciesthermsite)
+
+# Load species list
+specieslist <- open_dataset("data/output/occurrence.parquet")
+groups <- read.csv("data/groups.csv")
+group_lookup <- setNames(groups$group, paste(groups$rank, groups$taxon, sep = "_"))
+
+# Add a new column for group
+specieslist <- specieslist %>%
+    filter(taxonRank == "species") %>%
+    collect() %>%
+    distinct(scientificName, .keep_all = T) %>%
+    mutate(group = coalesce(
+        group_lookup[paste("class", class, sep = "_")],
+        group_lookup[paste("order", order, sep = "_")],
+        group_lookup[paste("phylum", phylum, sep = "_")]
+    )) %>%
+    select(species = scientificName, group)
+
+
+# Merge group information
+speciesthermsitedt_gp <- merge(speciesthermsitedt, specieslist[,c("species", "group")], by = "species")
+
+# Filter for fish only
+fishthermsitedt <- speciesthermsitedt_gp[group %in% c("fish", "sharks"), , ]
+fishthermsitedt <- fishthermsitedt[!is.na(fishthermsitedt$baseline_depthsurf_mean),]
+
+# Get functional traits data
+funcdiv <- rfishbase::species(unique(fishthermsitedt$species))
+funcdiv$to_include <- NA
+funcdiv$to_include[funcdiv$DepthRangeDeep <= 200] <- TRUE
+
+table(funcdiv$DemersPelag[is.na(funcdiv$to_include)])
+
+funcdiv$to_include[is.na(funcdiv$to_include) & 
+    funcdiv$DemersPelag %in% c("reef-associated", "pelagic-neritic")] <- TRUE
+
+table(funcdiv$DemersPelag[is.na(funcdiv$to_include)])
+
+sel_species <- funcdiv %>%
+    filter(to_include) %>%
+    select(species = Species, DepthRangeDeep, habitat = DemersPelag)
+
+# Convert to longer format
+fishthermsitedt_long <- fishthermsitedt %>%
+  select(species, higherGeography, baseline_depthsurf_mean, contains("depthsurf_dec50"), limit_dsurf) %>%
+  filter(species %in% unique(sel_species$species)) %>% # Remove this line to not filter by the traits
+  pivot_longer(cols = contains("depthsurf"), names_to = "scenario",
+               values_to = "sst") %>%
+  mutate(scenario = case_when(
+    scenario == "baseline_depthsurf_mean" ~ "site_current",
+    scenario == "ssp126_depthsurf_dec50_mean" ~ "site_ssp126_dec50",
+    scenario == "ssp245_depthsurf_dec50_mean" ~ "site_ssp245_dec50",
+    scenario == "ssp370_depthsurf_dec50_mean" ~ "site_ssp370_dec50",
+    scenario == "ssp585_depthsurf_dec50_mean" ~ "site_ssp585_dec50",
+  ))
+
+
+# Create a function to see at which scenario the species become at risk
+scenario_order <- c("site_current", "site_ssp126_dec50", 
+                    "site_ssp585_dec50")
+get_class <- function(x) {
+  if(all(x < 0)) {
+    return("not_risk")
+  } else {
+    return(scenario_order[min(which(x >= 0))])
+  }
+}
+
+# Retrieve sites names and latitudes
+sites <- vect("data/shapefiles/marine_world_heritage.gpkg")
+sites$latitude <- unlist(sapply(1:length(sites), function(x){
+  unname(terra::geom(centroids(sites[x,]))[,4])
+}))
+sites_org <- data.frame(sites = sites$name, latitude = sites$latitude)
+
+# Change names of sites
+sites_org <- sites_org %>%
+  rename(higherGeography = sites) %>%
+  mutate(higherGeography = case_when(
+    higherGeography == "Wadden Sea" ~ "Wadden Sea",
+    higherGeography == "Archipiélago de Revillagigedo" ~ "Revillagigedo",
+    higherGeography == "iSimangaliso Wetland Park" ~ "iSimangaliso",
+    higherGeography == "Ningaloo Coast" ~ "Ningaloo coast",
+    higherGeography == "Socotra Archipelago" ~ "Socotra Archipelago",
+    higherGeography == "Belize Barrier Reef Reserve System" ~ "Belize Barrier Reef",
+    higherGeography == "Cocos Island National Park" ~ "Cocos Island",
+    higherGeography == "Lord Howe Island Group" ~ "Lord Howe Island",            
+    higherGeography == "Sundarbans National Park" ~ "Sundarbans",                    
+    higherGeography == "Península Valdés" ~ "Peninsula Valdès",
+    higherGeography == "Gulf of Porto: Calanche of Piana, Gulf of Girolata, Scandola Reserve" ~ "Scandola",         
+    higherGeography == "Coiba National Park and its Special Zone of Marine Protection" ~ "Coiba",             
+    higherGeography == "Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems" ~ "Lagoons of New Caledonia",         
+    higherGeography == "Shark Bay, Western Australia" ~ "Shark Bay",                                               
+    higherGeography == "Tubbataha Reefs Natural Park" ~ "Tubbataha Reefs",                                              
+    higherGeography == "Brazilian Atlantic Islands: Fernando de Noronha and Atol das Rocas Reserves" ~ "Fernando de Noronha",
+    higherGeography == "Everglades National Park" ~ "Everglades",                                                  
+    higherGeography == "Aldabra Atoll" ~ "Aldabra Atoll",                                                             
+    higherGeography == "Banc d'Arguin National Park" ~ "Banc d'Arguin",                                               
+    higherGeography == "French Austral Lands and Seas" ~ "French Austral Lands",
+  )) %>%
+  filter(!is.na(higherGeography)) %>%
+  arrange(desc(latitude)) %>%
+  distinct(higherGeography, .keep_all = T)
+
+# Update names on the longer object
+fishthermsitedt_long <- fishthermsitedt_long %>%
+  mutate(higherGeography = case_when(
+    higherGeography == "Wadden Sea" ~ "Wadden Sea",
+    higherGeography == "Archipiélago de Revillagigedo" ~ "Revillagigedo",
+    higherGeography == "iSimangaliso Wetland Park" ~ "iSimangaliso",
+    higherGeography == "Ningaloo Coast" ~ "Ningaloo coast",
+    higherGeography == "Socotra Archipelago" ~ "Socotra Archipelago",
+    higherGeography == "Belize Barrier Reef Reserve System" ~ "Belize Barrier Reef",
+    higherGeography == "Cocos Island National Park" ~ "Cocos Island",
+    higherGeography == "Lord Howe Island Group" ~ "Lord Howe Island",            
+    higherGeography == "The Sundarbans" ~ "Sundarbans",                    
+    higherGeography == "Peninsula Valdès" ~ "Peninsula Valdès",
+    higherGeography == "Gulf of Porto: Calanche of Piana, Gulf of Girolata, Scandola Reserve" ~ "Scandola",         
+    higherGeography == "Coiba National Park and its Special Zone of Marine Protection" ~ "Coiba",             
+    higherGeography == "Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems" ~ "Lagoons of New Caledonia",         
+    higherGeography == "Shark Bay, Western Australia" ~ "Shark Bay",                                               
+    higherGeography == "Tubbataha Reefs Natural Park" ~ "Tubbataha Reefs",                                              
+    higherGeography == "Brazilian Atlantic Islands: Fernando de Noronha and Atol das Rocas Reserves" ~ "Fernando de Noronha",
+    higherGeography == "Everglades National Park" ~ "Everglades",                                                  
+    higherGeography == "Aldabra Atoll" ~ "Aldabra Atoll",                                                             
+    higherGeography == "Banc d'Arguin National Park" ~ "Banc d'Arguin",                                               
+    higherGeography == "French Austral Lands and Seas" ~ "French Austral Lands",
+  ))
+
+# Add means
+species_lims_full <- read_parquet(file.path("data", "species_thermal_lims.parquet"))
+species_lims_full <- species_lims_full %>%
+    filter(metric == "q_0.95") %>%
+    filter(depth == "depthsurf") %>%
+    select(species, q_0.95 = value)
+
+fishthermsitedt_long <- left_join(fishthermsitedt_long, species_lims_full)
+
+# Get the number of species in each site
+sites_species <- fishthermsitedt_long %>%
+  group_by(higherGeography, species) %>%
+  summarise(n = n()) %>%
+  group_by(higherGeography) %>%
+  summarise(n = n())
+
+# Join with the sites object
+sites_org <- left_join(sites_org, sites_species)
+
+# Create the plot object with all information
+plot_obj <- fishthermsitedt_long %>%
+  filter(scenario %in% c("site_current", "site_ssp126_dec50", "site_ssp585_dec50")) %>%
+  select(species, higherGeography, scenario, differences = sst, q_0.95) %>%
+  filter(!is.na(differences)) %>%
+  group_by(species, higherGeography) %>%
+  mutate(point_class = get_class(differences)) %>%
+  mutate(higherGeography = factor(higherGeography, levels = rev(sites_org$higherGeography),
+                                  labels = rev(paste(sites_org$higherGeography, paste0("(", sites_org$n, ")")))))
+plot_obj$point_class <- factor(plot_obj$point_class, levels = c("not_risk", "site_current", "site_ssp126_dec50", "site_ssp585_dec50"))
+
+# Extract sites SST
+sites_sst <- read_parquet("data/sites_sst_all.parquet")
+
+sites_sst <- sites_sst %>%
+    mutate(higherGeography = case_when(
+    higherGeography == "Wadden Sea" ~ "Wadden Sea",
+    higherGeography == "Archipiélago de Revillagigedo" ~ "Revillagigedo",
+    higherGeography == "iSimangaliso Wetland Park" ~ "iSimangaliso",
+    higherGeography == "Ningaloo Coast" ~ "Ningaloo coast",
+    higherGeography == "Socotra Archipelago" ~ "Socotra Archipelago",
+    higherGeography == "Belize Barrier Reef Reserve System" ~ "Belize Barrier Reef",
+    higherGeography == "Cocos Island National Park" ~ "Cocos Island",
+    higherGeography == "Lord Howe Island Group" ~ "Lord Howe Island",            
+    higherGeography == "The Sundarbans" ~ "Sundarbans",                    
+    higherGeography == "Peninsula Valdès" ~ "Peninsula Valdès",
+    higherGeography == "Gulf of Porto: Calanche of Piana, Gulf of Girolata, Scandola Reserve" ~ "Scandola",         
+    higherGeography == "Coiba National Park and its Special Zone of Marine Protection" ~ "Coiba",             
+    higherGeography == "Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems" ~ "Lagoons of New Caledonia",         
+    higherGeography == "Shark Bay, Western Australia" ~ "Shark Bay",                                               
+    higherGeography == "Tubbataha Reefs Natural Park" ~ "Tubbataha Reefs",                                              
+    higherGeography == "Brazilian Atlantic Islands: Fernando de Noronha and Atol das Rocas Reserves" ~ "Fernando de Noronha",
+    higherGeography == "Everglades National Park" ~ "Everglades",                                                  
+    higherGeography == "Aldabra Atoll" ~ "Aldabra Atoll",                                                             
+    higherGeography == "Banc d'Arguin National Park" ~ "Banc d'Arguin",                                               
+    higherGeography == "French Austral Lands and Seas" ~ "French Austral Lands",
+  )) %>%
+  filter(!is.na(higherGeography))
+
+sites_sst <- sites_sst %>%
+  select(higherGeography, baseline_depthsurf_mean, contains("depthsurf_dec50")) %>%
+  pivot_longer(cols = contains("depthsurf"), names_to = "scenario",
+               values_to = "sst") %>%
+  mutate(scenario = case_when(
+    scenario == "baseline_depthsurf_mean" ~ "site_current",
+    scenario == "ssp126_depthsurf_dec50_mean" ~ "site_ssp126_dec50",
+    scenario == "ssp245_depthsurf_dec50_mean" ~ "site_ssp245_dec50",
+    scenario == "ssp370_depthsurf_dec50_mean" ~ "site_ssp370_dec50",
+    scenario == "ssp585_depthsurf_dec50_mean" ~ "site_ssp585_dec50",
+  )) %>%
+  filter(scenario %in% c("site_current", "site_ssp126_dec50", "site_ssp585_dec50")) %>%
+  filter(higherGeography %in% unique(fishthermsitedt_long$higherGeography)) %>%
+  mutate(point_class = scenario) %>%
+  left_join(sites_org[,c("higherGeography", "n")]) %>%
+  mutate(higherGeography = paste(higherGeography, paste0("(", n, ")")))
+sites_sst$point_class <- factor(sites_sst$point_class, levels = c("not_risk", "site_current", "site_ssp126_dec50", "site_ssp585_dec50"))
+
+
+# Make plots ----
+# Left plot - dots
+plot_a <- plot_obj %>%
+  group_by(higherGeography) %>%
+  distinct(species, .keep_all = T) %>%
+  ggplot() +
+  geom_violinhalf(aes(x = higherGeography, y = q_0.95),
+                  position = position_nudge(x = 0.3)) +
+  geom_jitter(aes(x = higherGeography, y = q_0.95, color = point_class), alpha = .3,
+              width = 0.12, shape = 16, show.legend = F) +
+  geom_linerange(data = sites_sst, aes(ymin = sst-0.1, ymax = sst+0.1, x = higherGeography, color = point_class),
+                 linewidth = 7, show.legend = T) +
+  scale_color_manual(values = c("grey90", "#d4b9da",  "#df65b0",  "#980043"),
+                     labels = c("Not at risk", "Current", "SSP1", "SSP5")) +
+  coord_flip() +
+  ylab("Temperature (°C)") + xlab(NULL) +
+  theme_light() +
+  scale_y_continuous(breaks = c(0, 10, 20, 30), limits = c(0, 32)) +
+  theme(panel.border = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        panel.grid.major.y = element_blank(),
+        legend.title = element_blank(),
+        legend.position = "bottom",
+        strip.background = element_blank(),
+        strip.text = element_text(color = "black"),
+        plot.margin = unit(c(0.1,0.1,0.1,0), "cm"));plot_a
+
+# Right plot - bars
+plot_b <- plot_obj %>%
+  group_by(higherGeography, scenario) %>%
+  summarise(percentage = sum(differences > 0)/length(differences)) %>%
+  ggplot() +
+  geom_bar(aes(x = higherGeography, y = percentage, fill = scenario), stat = "identity", position="dodge", width = .6) +
+  scale_fill_manual(values = c("#d4b9da",  "#df65b0",  "#980043")) +
+  scale_y_continuous(labels = c("0%", "", "50%", "", "100%")) +
+  coord_flip() +
+  ylab("Potentially Affected Fraction") + xlab(NULL) +
+  theme_light() +
+  theme(panel.border = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        panel.grid.major.y = element_blank(),
+        axis.text.y = element_blank(),
+        axis.ticks.y = element_blank(),
+        legend.title = element_blank(),
+        legend.position = "none",
+        strip.background = element_blank(),
+        strip.text = element_text(color = "black"),
+        plot.margin = unit(c(0,0,0,0), "cm"));plot_b
+
+# Put together
+plot_a + plot_b + plot_layout(widths = c(4, 1))
+
+# Save
+ggsave("figures/paf_fishes_filtered_v2_nl.png", width = 12, height = 7)
+ggsave("figures/paf_fishes_filtered_v2_nl.svg", width = 12, height = 7)
+
+plot_c <- plot_obj %>%
+  group_by(higherGeography, scenario) %>%
+  summarise(percentage = sum(differences > 0)/length(differences)) %>%
+  ggplot() +
+  geom_bar(aes(x = higherGeography, y = percentage, fill = scenario), stat = "identity", position="dodge", width = .6) +
+  scale_fill_manual(values = c("#d4b9da",  "#df65b0",  "#980043")) +
+  scale_y_continuous(labels = c("0%", "", "50%", "", "100%")) +
+  coord_flip() +
+  ylab("Potentially Affected Fraction") + xlab(NULL) +
+  theme_light() +
+  theme(panel.border = element_blank(),
+        panel.grid.minor.x = element_blank(),
+        panel.grid.major.y = element_blank(),
+        #axis.text.y = element_blank(),
+        axis.ticks.y = element_blank(),
+        legend.title = element_blank(),
+        legend.position = "none",
+        strip.background = element_blank(),
+        strip.text = element_text(color = "black"),
+        plot.margin = unit(c(0,0,0,0), "cm"));plot_c
+
+ggsave("figures/paf_fishes_filtered_onlybars_v2_nl.png", width = 8, height = 7)
+ggsave("figures/paf_fishes_filtered_onlybars_v2_nl.svg", width = 8, height = 7)