From 08c5745d458c79408e59b8bbf6b621f27246142d Mon Sep 17 00:00:00 2001 From: sdgamboa Date: Thu, 7 Dec 2023 23:16:38 -0500 Subject: [PATCH] update get ltp script --- inst/scripts/get_living_tree_2023.R | 58 ++++++++++++++--------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/inst/scripts/get_living_tree_2023.R b/inst/scripts/get_living_tree_2023.R index 25c3217..15150d7 100644 --- a/inst/scripts/get_living_tree_2023.R +++ b/inst/scripts/get_living_tree_2023.R @@ -1,4 +1,5 @@ +# Setup ------------------------------------------------------------------- library(ape) library(taxonomizr) library(dplyr) @@ -7,6 +8,8 @@ library(tidyr) library(stringr) library(phytools) +## The file accessionTaxa.sql was downloaded with taxonomizr +## The file is kind of large, so better to downloaded just one time sql <- '~/accessionTaxa.sql' treeUrl <- 'https://imedea.uib-csic.es/mmg/ltp/wp-content/uploads/ltp/LTP_all_08_2023.ntree' treeFilePath <- file.path('inst', 'extdata', 'LTP_all_08_2023.ntree') @@ -24,27 +27,36 @@ if (file.exists(treeFilePath)) { write.tree(tree, treeFilePath) } - # Accession to taxids ----------------------------------------------------- accession_rgx <- '_-.*--_' -accessions <- str_extract(tree$tip.label, accession_rgx) |> +accessions <- tree$tip.label |> + str_extract(accession_rgx) |> str_remove_all('[_-]') -taxnames <- str_extract(tree$tip.label, paste0("^.*", accession_rgx)) |> +taxnames <- tree$tip.label |> + str_extract(paste0("^.*", accession_rgx)) |> str_remove(accession_rgx) |> str_remove('-$') |> str_replace_all('_', ' ') |> str_squish() +taxids <- accessionToTaxa(accessions, sql, version = 'base') # base doesn't include decimal version -taxids <- accessionToTaxa( - accessions = accessions, sqlFile = sql, version = 'base' -) +# Complete missing taxa --------------------------------------------------- missing_taxa <- taxnames[which(is.na(taxids))] +## Below, the two only cases that need to be updated manually missing_taxa[which(missing_taxa == 'Micromonospora okii')] <- 'Micromonospora sp. TP-A0468' missing_taxa[which(missing_taxa == 'Sala cibi')] <- 'Salella cibi' not_missing_anymore_taxa <- taxizedb::name2taxid(missing_taxa, db = 'ncbi') taxids[which(is.na(taxids))] <- not_missing_anymore_taxa +# Get full taxonomy ------------------------------------------------------- +## beware of the unique function call here +## which might cause lenght of taxonomy and taxids to not match +## This is solved with left_join when needed taxonomy <- taxizedb::classification(unique(taxids), db = 'ncbi') + +## Even when taxids are valid, some of them might need to be updated +## when using taxizedb instead of taxize. +## taxizedb stores the most current file in the cache missing_taxonomy_positions <- which(map_lgl(taxonomy, ~ all(is.na(.x)))) not_missing_anymore_taxonomy <- taxize::classification( names(missing_taxonomy_positions), db = 'ncbi' @@ -52,9 +64,7 @@ not_missing_anymore_taxonomy <- taxize::classification( chr_vct <- map_chr(not_missing_anymore_taxonomy, ~ tail(.x$id, 1)) for (i in seq_along(chr_vct)) { pos <- which(taxids == names(chr_vct[i])) - message( - 'Replacing ', unique(taxids[pos]), ' with ', chr_vct[i] - ) + message('Replacing ', unique(taxids[pos]), ' with ', chr_vct[i]) taxids[pos] <- chr_vct[i] } @@ -101,9 +111,11 @@ getMRCA <- function(tree, tips) { res } -tx <- paste0(taxonomic_ranks, '_taxid') -tx[which(tx == 'superkingdom_taxid')] <- 'kingdom_taxid' -mrcas <- flatten(purrr::map(tx, ~ split(tip_data, factor(tip_data[[.x]])))) +ranks_ids <- paste0(taxonomic_ranks, '_taxid') +ranks_ids[which(ranks_ids == 'superkingdom_taxid')] <- 'kingdom_taxid' + +## TODO I'm currently in this line. Check node data. +mrcas <- flatten(purrr::map(ranks_ids, ~ split(tip_data, factor(tip_data[[.x]])))) mrcas <- purrr::map(mrcas, ~ .x[['tip_label']]) mrcas <- map_int(mrcas, ~ getMRCA(tree, .x)) mrcas <- mrcas[!is.na(mrcas)] @@ -156,34 +168,18 @@ nodes_new_taxonomy <- purrr::map(nodes_taxonomy, ~ { node_data <- left_join(nodes_with_taxid, nodes_new_taxonomy, by= 'taxid') node_data$rank <- taxizedb::taxid2rank(node_data$taxid, db = 'ncbi') - +# Small adjustments ------------------------------------------------------- node_data$NCBI_ID <- taxPPro::addRankPrefix(node_data$taxid, node_data$rank) tip_data$NCBI_ID <- taxPPro::addRankPrefix(tip_data$taxid, tip_data$rank) -## ape write.tree won't allow commas and spaces when writin tree name -# tree_data$tip_label <- gsub(' ', '_', tree_data$tip_label) -# tree_data$tip_label <- gsub('[,;]', '-', tree_data$tip_label) -# tree_data$tip_label <- gsub('"', '', tree_data$tip_label) -# tree_data$tip_label <- gsub("'", '', tree_data$tip_label) -# tree_data$tip_label <- gsub("\\[T\\]", '', tree_data$tip_label) -# -# tree$tip.label<- gsub(' ', '_', tree$tip.label) -# tree$tip.label <- gsub('[,;]', '-', tree$tip.label) -# tree$tip.label <- gsub('"', '', tree$tip.label) -# tree$tip.label <- gsub("'", '', tree$tip.label) -# tree$tip.label <- gsub("\\[T\\]", '', tree$tip.label) - - +## A small check all(tree$tip.label %in% tip_data$tip_label) - -# tip_data$taxid +## It's better to update taxon name and rank here tip_data$Taxon_name <- taxizedb::taxid2name(tip_data$taxid, db = 'ncbi') tip_data$Rank <- taxizedb::taxid2rank(tip_data$taxid, db = 'ncbi') tip_data$rank <- NULL - -# node_data$taxid node_data$Taxon_name <- taxizedb::taxid2name(node_data$taxid, db = 'ncbi') node_data$Rank <- taxizedb::taxid2rank(node_data$taxid, db = 'ncbi') node_data$rank <- NULL