GOM_Metazoan_COI_script.Rmd

---
title: "Quick reference library for Emily Lancaster, GOM Invertebrates COI, Erin Grey 2023-10-20"
output: html_document
---

```{r setup, include=FALSE}
rm(list=ls()) # clear working environment
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```

### UPDATE EACH OF THESE FOUR VARIABLES FOR YOUR ANALYSIS
```{r define_variables}
entrez_key <- "7c5ac035201a1835b5a81de1b74ec8613d08" #GET YOUR OWN ENTREZ KEY AND PUT IT HERE!
locus = "COI" #name of target locus, see workflow notes above for your choices
output_folder <- "gom_inverts_2023-10-17" #name of your output folder every time
species_list <- read.csv("gom_inverts_2023-10-17/a03_BESTNAMES_v1.csv") # your species list
order_list <- read.csv("MetazoaSpeciesByOrder_2023-09-19.csv") # list of metazoan orders
```

### LOAD PACKAGES, DEFINE TERM, AND CREATe DATABASE SKELETONS. NOTE - LOADING PACKAGES CAN BE TRICKY
```{r load_packages_and_terms}
library(taxizedb) #checks taxonomy
library(rentrez) #queries ENTREZ databases and downloads accessions
library(AnnotationBustR) #finds longest accessions, slice genes from mitogenomes
library(reutils) #other packages need it
library(ape) #convert fasta, fastq, etc.
library(Biostrings)
library(genbankr) #parse genbank files
library(ggplot2) #plots

set_entrez_key(entrez_key) #set the Entrez API key
dir.create(output_folder)

# create ENTREZ Search terms
data(mtDNAterms) #AnnotationBustR's list of synonyms for different loci
more_12Ssynonyms <- data.frame(Locus="rRNA_12S", Type="rRNA", Name= "small ribosomal RNA subunit RNA") # other synonyms that we find go here and get added to AnnotationBustR's list
mtDNAterms <- rbind(mtDNAterms, more_12Ssynonyms) #format
target_locus_synonyms <- mtDNAterms[mtDNAterms$Locus==locus,] #the target synonyms
target_locus_synonyms$Terms <- 
    paste0("OR ", target_locus_synonyms$Name, "[TITL]") # format for ENTREZ search terms
target_locus_synonyms$Terms[1] <- 
    paste0("AND (", target_locus_synonyms$Name[1], "[TITL]") # first term starts with "AND ("
target_locus_synonyms$Terms[dim(target_locus_synonyms)[1]] <- 
    paste0("OR ",   target_locus_synonyms$Name[dim(target_locus_synonyms)[1]], "[TITL])") #last term ends with a ")"
target_locus_searchterm <- paste(as.vector(target_locus_synonyms$Terms), collapse=" ") # the big ENTREZ search term

# create empty dataframes to fill in during loops
tax_df <- as.data.frame(matrix(nrow=1, ncol=7, dimnames=list(NULL, c("tax_query", "phylum", "class", "order", "family", "genus", "species")))) # create taxonomy skeleton
tax_df2 <- as.data.frame(matrix(nrow=1, ncol=7, dimnames=list(NULL, c("tax_query", "phylum", "class", "order", "family", "genus", "species")))) # create taxonomy skeleton
order_reps <- as.data.frame(matrix(nrow=1, ncol=6, dimnames=list(NULL, c("phylum", "class", "order", "species_id", "ids_mitogenome","ids_target")))) # create missing orders representatives skeleton
a04_REFDB <- data.frame(seq_header=NA, sequence=NA, seq_accession=NA, type=NA, species=NA) #create reference database skeleton
t = 1 # taxonomy loop counter
o = 1 # missing orders loop counter
j = 1 # missing orders species loop counter
i = 1 # ENTREZ search loop counter
s = 1 # sequence scrape loop counter
m = 1 # mitogenome scrape loop counter
z = 1 # output format loop counter
d = 1 # dada2 taxonomy counter
```

## The rest of this script should just run itself. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r taxonomy}
taxonomies_cls <- taxizedb::classification(species_list[,5], db="ncbi")#get full taxonomies for each species, output is a list-like "classification" object that sucks
#loop through the sucky classification object to populate the empty dataframe
for (t in 1:length(taxonomies_cls)) {
  x <- as.data.frame(taxonomies_cls[t])
  tax_query <- strsplit(colnames(x)[1], split="[.]")
  tax_query <- paste(tax_query[[1]][1], tax_query[[1]][2], sep=" ")
  # update the taxonomy dataframe skeleton
  tax_df[nrow(tax_df)+1,] <- rep("NA", dim(tax_df)[2])
  tax_df$tax_query[t] <- tax_query
  if (dim(x)[1]>1) {
        c1 <- paste0(x[which(x[,2]=="phylum"),c(1,3)], collapse="_")
        c2 <- paste0(x[which(x[,2]=="class"),c(1,3)], collapse="_")
        c3 <- paste0(x[which(x[,2]=="order"),c(1,3)], collapse="_")
        c4 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
        c5 <- paste0(x[which(x[,2]=="genus"),c(1,3)], collapse="_")
        c6 <- paste0(x[which(x[,2]=="species"),c(1,3)][1,], collapse="_")
  }
  
  if (exists("c1")){
      if (length(c1) > 0) {
        tax_df$phylum[t] <- c1
        } else {
        tax_df$phylum[t] <- "na"    
        }
    } 
  
  if (exists("c2")){
      if (length(c2) > 0) {
        tax_df$class[t] <- c2
        } else {
        tax_df$class[t] <- "na"    
        }
    } 
  
  if (exists("c3")){
      if (length(c3) > 0) {
        tax_df$order[t] <- c3
        } else {
        tax_df$order[t] <- "na"    
        }
    } 
  
   if (exists("c4")){
      if (length(c4) > 0) {
        tax_df$family[t] <- c4
        } else {
        tax_df$family[t] <- "na"    
        }
    } 
  
  if (exists("c5")){
      if (length(c5) > 0) {
        tax_df$genus[t] <- c5
        } else {
        tax_df$genus[t] <- "na"    
        }
    } 
  
   if (exists("c6")){
      if (length(c6) > 0) {
        tax_df$species[t] <- c6
        } else {
        tax_df$species[t] <- "na"    
        }
    } 
  
  
 suppressWarnings(rm(list = c("x", "tax_query", "c1","c2","c3","c4","c5","c6")))
}

a03_BESTNAMES <- merge(species_list, tax_df, by.x="search_name", by.y="tax_query", all.x=TRUE)
a03_BESTNAMESwithOrderTax <- droplevels(subset(a03_BESTNAMES, !is.na(a03_BESTNAMES$order))) # species with taxonomies
orders_missing <- droplevels(subset(order_list, !(order_list$order %in% unique(factor(a03_BESTNAMESwithOrderTax$order))))) # orders that are missing from species list
row.names(orders_missing) = NULL # format
```

## Find up to 3 mitogenomes or target sequences for missing orders. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r missing_orders_seqs}
# find sequences for up to three species for each missing order; search mitogenomes first, then regular accessions
while (o <= dim(orders_missing)[1]) { # for every missing order
    cat("\r", "finding ref seqs for order", o, "of", dim(orders_missing)[1]) #counter
    STOPIT <- "NO" # set STOPIT to NO for this order
    while (STOPIT == "NO"){ # keep searching as long as STOPIT = "NO"
        species_ids_ls <- strsplit(orders_missing$spp_list[[o]], ";") # get the species ids for that order
        if (is.na(species_ids_ls[[1]][1])) {STOPIT <- "YES"} #IF THERE ARE NO GENBANK SPECIES FOR THIS ORDER, STOP SEARCHING FOR IT
        if (length(species_ids_ls[[1]])<=3) { 
            species_ids <- species_ids_ls[[1]] #format species ids
            for (j in 1:length(species_ids)){ # for each species
                search_name <- paste0("txid",species_ids[j],"[Organism]")
                mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
                if(class(mitogenomes)[1] != "try-catch") { #if the search went through
                    if (length(mitogenomes$ids)>0) {
                    mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
                    row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o], species_ids[i], mito_id, "NA")
                    order_reps <- rbind(order_reps, row_info)
                    rm(list=c("mito_id","row_info"))
                    Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
                    } else {
                        targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
                        if(class(targets)[1] != "try-catch") {
                            if (length(targets$ids)>0) {
                                target_id <- sample(targets$ids,1) #choose a random accession for this species
                                row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o],species_ids[j], "NA", target_id)
                                order_reps <- rbind(order_reps,row_info)
                                rm(list=c("target_id","row_info"))
                                Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
                                }
                            }
                            rm(targets)
                        }
                        rm(search_name); rm(mitogenomes)
                    }
                    STOPIT <- "YES"
                }
            }   #If there are 3 or fewer species, search them all for a mitogenome or accession then STOP SEARCHING
        if (length(species_ids_ls[[1]])>3){ 
            species_ids <- sample(species_ids_ls[[1]]) #randomize the species for each order
            finds = 0 #start with zero finds
            for (j in 1:length(species_ids)){
                search_name <- paste0("txid",species_ids[j],"[Organism]")
                mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
                if(class(mitogenomes)[1] != "try-catch") { #if the search went through
                    if (length(mitogenomes$ids)>0) {
                        mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
                        row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o], species_ids[j], mito_id, "NA")
                        order_reps <- rbind(order_reps,row_info)
                        finds=finds+1 #add to find count
                        rm(mito_id) 
                        Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
                    } else {
                            targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
                            if(class(targets)[1] != "try-catch") {
                                if (length(targets$ids)>0) {
                                    target_id <- sample(targets$ids,1) #choose a random accession for this species
                                    row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o],species_ids[j], "NA", target_id)
                                    order_reps <- rbind(order_reps,row_info)
                                    finds=finds+1 #add to find count
                                    rm(target_id); rm(row_info)
                                    Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
                                    }
                                }
                                rm(targets)
                            }
                            rm(search_name); rm(mitogenomes)
                            if (finds >= 3) break
                            STOPIT <- "YES"
                        }
                } #If there are >=3 species, randomize and search mitogenomes or accessions until you get 3 or have searched them all
            }
            rm(species_ids_ls)
            o=o+1
    }
}
order_reps <- na.omit(order_reps) #substract the first row because its all NAs
order_reps$search_name <- "NA"; order_reps$source_binomial<- "NA";order_reps$source<-"NA"; order_reps$submitted_name<-"NA"; order_reps$matched_name2<-"NA"; order_reps$n_mitogenome<-"NA"; order_reps$n_target<-"NA"; order_reps$species<-"NA"; order_reps$family<-"NA"; order_reps$genus<-"NA" #make column names same as a)# best names for merging
order_reps$n_mitogenome <- ifelse(order_reps$ids_mitogenome!="NA",1,0) #update the n_mitogenome column
order_reps$n_target <- ifelse(order_reps$ids_target!="NA",1,0) #update the n_target column
```

## Search Entrez for mitogenome and target sequence for your species. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r search_entrez}
while (i <= dim(a03_BESTNAMES)[1]){
  cat("\r", "finding Genbank accessions & mitogenomes for species", i, "of", dim(a03_BESTNAMES)[1]) #counter
  # define search terms for species
  search_name <- paste0(a03_BESTNAMES$search_name[i],"[ORGN]") #format species name for ENTREZ search
  search_term <- paste(search_name, target_locus_searchterm, collapse=" ") #mash species & locus terms into one search term
  # search ENTREZ for mitogenomes and then for accessions
   tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) -> mitogenomes # search mitogenomes
      if(class(mitogenomes)[1] != "try-catch") {
        a03_BESTNAMES$n_mitogenome[i] <-mitogenomes$count #add mitogenome count names dataframe
        a03_BESTNAMES$ids_mitogenome[i] <-paste(mitogenomes$ids, collapse="|") #add mitogenome ids to name dataframe
            tryCatch(entrez_search(db="nucleotide", term <- search_term, retmax=9999)) -> targets # search accessions
                if(class(targets)[1] != "try-catch") {
                    a03_BESTNAMES$n_target[i] <-targets$count
                    a03_BESTNAMES$ids_target[i] <- paste(targets$ids, collapse="|")
                }
            }
  Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
  # reset loop variables
  mitogenomes <- "na"
  targets <- "na"
  search_name <- "na"
  search_term <- "na"
  i <- i + 1
}
a03_BESTNAMES$species_id <- "NA" #make similar to order_reps for merging
orders_rep <- order_reps[,colnames(a03_BESTNAMES)] # format the order_reps dataframe to add to the a03_BESTNAMES datafame
a03_BESTNAMES<- rbind(a03_BESTNAMES, order_reps) # add the order_reps dataframe to the a03BESTANAMES dataframe
a03_BESTNAMES$n_target <- as.numeric(a03_BESTNAMES$n_target) #format
a03_BESTNAMES$n_mitogenome <- as.numeric(a03_BESTNAMES$n_mitogenome) #format
row.names(a03_BESTNAMES) = NULL # format
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v2.csv"), row.names = FALSE) # save the mitogenome and target accessions ids for scraping
```

```{r scrape_accessions}

while (s <= dim(a03_BESTNAMES)[1]){ #for every good species name
    cat("\r","scraping sequences for species", s, "of",dim(a03_BESTNAMES)[1])
    ids <- "na"
    seqs_target <- "na"
    if (a03_BESTNAMES$n_target[s]>0 && a03_BESTNAMES$n_target[s]<100) { # scrape GenBank target sequences if available, but skip if >= 100 targets
     ids <- c(unlist(strsplit(a03_BESTNAMES$ids_target[s], split="\\|")))
      } else if  (a03_BESTNAMES$n_target[s]>100) {   # if more than 200 accessions, randomly select 100  
     ids <- sample(c(unlist(strsplit(a03_BESTNAMES$ids_target[s], split="\\|"))),100)
      }
    if (ids[1] !="na"){ # if there are accessions, fetch them from GenBank
        seqs_target <- tryCatch(entrez_fetch(db="nuccore", id=ids, rettype="fasta"))
        }
        if(class(seqs_target) != "try-catch"){
            if (seqs_target != "na"){
            write(seqs_target, file.path(output_folder, paste(a03_BESTNAMES$search_name[s], paste0(locus, ".fasta")))) # formatting - write out the sequences
            fasta_target <- readDNAStringSet(file.path(output_folder, paste(a03_BESTNAMES$search_name[s], paste0(locus, ".fasta"))), format="fasta") #formatting - read them back in as fasta
            seqs_target_accessions <- entrez_fetch(db="nuccore", id=ids, rettype="acc") # get accession numbers
            seq_header <- names(fasta_target) #formatting
            sequence <- paste(fasta_target) #formatting
            seq_accession <- unlist(strsplit(seqs_target_accessions, split="\n")) # formatting
            tempDB <- data.frame(seq_header, sequence, seq_accession, type="accession", species=a03_BESTNAMES$search_name[s]) # make a temporary database with all sequences, their header, accession number, etc.
            a04_REFDB <- rbind(a04_REFDB, tempDB) # append temporary database to the full database
                }
            s=s+1
            }
    #reset loop variables
    fasta_target<-"na"; seqs_target_accessions<-"na"; seq_header<-"na"; sequence<-"na"; seq_accession<-"na";
    tempDB <-"na"
    #slow down request to the Entrez server or you'll get kicked out
    Sys.sleep(1)
 }

```

```{r scrape_mitogenomes}
while (m <= dim(a03_BESTNAMES)[1]) { #for every good species name
    cat("\r","scraping mitogenomes for species", m, "of", dim(a03_BESTNAMES)[1])
    mito_ids <- "na"
    if (a03_BESTNAMES$n_mitogenome[m]>0 && a03_BESTNAMES$n_mitogenome[m]<20) { #if mitogenomes available and <20
        mito_ids <- unlist(strsplit(a03_BESTNAMES$ids_mitogenome[m], split="\\|")) #format ids
    } else if (a03_BESTNAMES$n_mitogenome[m]>20) { #if >20 mitogenomes, sample 20 randomly
        mito_ids <- sample(unlist(strsplit(a03_BESTNAMES$ids_mitogenome[m], split="\\|")),20) # format ids  
    }
   
    if (mito_ids[1] != "na") { #if there are mitogenome ids for species m
        mito_accessions <- tryCatch(entrez_fetch(mito_ids, db="nuccore", rettype="acc")) #get accessions for each id
        if (class(mito_accessions) != "try-catch"){ #if no api error
            mito_accessions <- unlist(strsplit(mito_accessions, split="\n")) #format accession numbers
            for (n in 1:length(mito_accessions)){ # loop through and scrape each mitogenome accession
                gb <- readGenBank(GBAccession(mito_accessions[n])) # get the Genbank annotation for accession
                target_feature <- which(genes(gb)$gene %in% as.character(target_locus_synonyms$Name)) # find target locus annotation metadata (note: use otherFeatures for for rRNAs, tRNAs, etc. and use genes(gb) for genes like COI, CYTB, NADH, etc.)
                new_row <- c(paste("Unparsed mitochondrion", mito_accessions[n], sep=" "), "na", mito_accessions[n], "scrape", species=a03_BESTNAMES$search_name[m])
                if(length(target_feature) > 0) { # if target feature is found
                    target_range <- genes(gb)@ranges[target_feature][1] #extract the target range info
                    target_strand <-  genes(gb)@strand[target_feature[1]] #extract the target strand info (+ or -)
                    target_seq <- subseq(getSeq(gb), start=target_range@start, width=target_range@width) #scrape seq
                    scrapedseq_binomial <- names(target_seq) #get the binomial name
                    scraped_seq <- paste(target_seq) #format
                    scraped_range <- paste(target_range) #format
                    new_row <- c(paste(names(target_seq),"mitochondrion", mito_accessions[n], sep=" "), paste(target_seq), mito_accessions[n], "scrape", species=a03_BESTNAMES$search_name[m]) #update information
                }
            a04_REFDB <- rbind(a04_REFDB, new_row) # update the database
            rm(gb, target_feature, target_strand, target_seq, scrapedseq_binomial, scraped_seq, scraped_range, new_row) # reset n loop variables
            Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
            } # close n loop (each "n" mitogenome accession per "m" species)
        }
    }
    m=m+1 #update species m variable
    rm(mito_ids, mito_accessions) # reset loop variables
    Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close for each species m loop

a04_REFDB <- a04_REFDB[-1,] #format - remove the top row of NAs
a04_REFDB_unparsed <- subset(a04_REFDB, sequence == "na")
a04_REFDB_parsed <- subset(a04_REFDB, sequence != "na")
write.csv(a04_REFDB_parsed, file.path(output_folder, "a04_REFDB.csv"), row.names=FALSE)
```

```{r remove_duplicates}
a05_UNIQUEDB <- a04_REFDB[!duplicated(a04_REFDB$sequence),] #remove duplicates
for (i in  1:dim(a05_UNIQUEDB)[1]){ # for every row in the unique db file
  dups <- subset(a04_REFDB, sequence == a05_UNIQUEDB$sequence[i]) # find identical sequences in the fill db file 
  a05_UNIQUEDB$duplicate_accessions[i] <- paste(dups$seq_accession, collapse = "|") # paste all those accessions together into a new unique db field
  a05_UNIQUEDB$duplicate_species[i] <- paste(dups[!duplicated(dups$species),"species"], collapse = "|")
}
row.names(a05_UNIQUEDB) <- NULL #format
write.csv(a05_UNIQUEDB, file.path(output_folder, "a05_UniqueRefDB.csv"), row.names=FALSE)
```

```{r format_dada2}
a05_UNIQUEDB_format <- a05_UNIQUEDB #find seqs that are missing species names
row.names(a05_UNIQUEDB_format) <- NULL #format
for (z in 1:dim(a05_UNIQUEDB_format)[1]){
    name_string <- strsplit(a05_UNIQUEDB_format$seq_header[z], " ")[[1]]
    if (a05_UNIQUEDB_format$type[z]=="scrape"){
        a05_UNIQUEDB_format$species[z] <- paste(name_string[1], name_string[2], sep=" ")
    } else {
        if(a05_UNIQUEDB_format$type[z]=="accession"){
        a05_UNIQUEDB_format$species[z] <- paste(name_string[2], name_string[3], sep=" ")
        }
    }
    rm(name_string)
}

a03_BESTNAMES$search_name <- as.character(a03_BESTNAMES$search_name) #format
a05_UNIQUEDB_format$species <- as.character(a05_UNIQUEDB_format$species) #format
write.csv(a05_UNIQUEDB, file.path(output_folder, "a05_UniqueRefDB.csv"), row.names=FALSE)
a06_dadaDB <- merge(x=a05_UNIQUEDB_format, y=a03_BESTNAMES, by.x="species", by.y="search_name", all.x=TRUE, all.y=FALSE) 


```

```{r taxonomy_dada2}
taxonomies_all_cls <- taxizedb::classification(a06_dadaDB[,1], db="ncbi")#get full taxonomies for each species, output is a list-like "classification" object that sucks

#loop through the sucky classification object to populate the empty dataframe
for (d in 1:length(taxonomies_all_cls)) {
  x <- as.data.frame(taxonomies_all_cls[d])
  tax_query <- strsplit(colnames(x)[1], split="[.]")
  tax_query <- paste(tax_query[[1]][1], tax_query[[1]][2], sep=" ")
  # update the taxonomy dataframe skeleton
  tax_df2[nrow(tax_df2)+1,] <- rep("NA", dim(tax_df2)[2])
  tax_df2$tax_query[d] <- tax_query
  if (dim(x)[1]>1) {
        c1 <- paste0(x[which(x[,2]=="phylum"),c(1,3)], collapse="_")
        c2 <- paste0(x[which(x[,2]=="class"),c(1,3)], collapse="_")
        c3 <- paste0(x[which(x[,2]=="order"),c(1,3)], collapse="_")
        c4 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
        c5 <- paste0(x[which(x[,2]=="genus"),c(1,3)], collapse="_")
        c6 <- paste0(x[which(x[,2]=="species"),c(1,3)][1,], collapse="_")
  }
  
  if (exists("c1")){
      if (length(c1) > 0) {
        tax_df2$phylum[d] <- c1
        } else {
        tax_df2$phylum[d] <- "na"    
        }
    } 
  
  if (exists("c2")){
      if (length(c2) > 0) {
        tax_df2$class[d] <- c2
        } else {
        tax_df2$class[d] <- "na"    
        }
    } 
  
  if (exists("c3")){
      if (length(c3) > 0) {
        tax_df2$order[d] <- c3
        } else {
        tax_df2$order[d] <- "na"    
        }
    } 
  
   if (exists("c4")){
      if (length(c4) > 0) {
        tax_df2$family[d] <- c4
        } else {
        tax_df2$family[d] <- "na"    
        }
    } 
  
  if (exists("c5")){
      if (length(c5) > 0) {
        tax_df2$genus[d] <- c5
        } else {
        tax_df2$genus[d] <- "na"    
        }
    } 
  
   if (exists("c6")){
      if (length(c6) > 0) {
        tax_df2$species[d] <- c6
        } else {
        tax_df2$species[d] <- "na"    
        }
    } 
  
  
 suppressWarnings(rm(list = c("x", "tax_query", "c1","c2","c3","c4","c5","c6")))
}
tax_df2 <- tax_df2[!duplicated(tax_df2$tax_query),] #remove duplicated taxa
tax_df2$tax_query <- as.character(tax_df2$tax_query) #format for merging
a06_dadaDB$species <- as.character(a06_dadaDB$species) #format for merging
a06_dadaDB_tax <- merge(a06_dadaDB, tax_df2, by.x="species", by.y="tax_query", all.x=TRUE) #merge taxonomic information into ref database
a06_dadaDB_tax[a06_dadaDB_tax=="NA"] <- NA
colnames(a06_dadaDB_tax)[17] <- "species.x"

tax_fails <- a06_dadaDB_tax[is.na(a06_dadaDB_tax$order.x) & is.na(a06_dadaDB_tax$order.y),] #rows that are missing all higher-level taxonomic information
a06_dadaDB_tax_clean1 <- a06_dadaDB_tax[!(is.na(a06_dadaDB_tax$order.x) & is.na(a06_dadaDB_tax$order.y)),] # remove tax_fails
a06_dadaDB_tax_clean2 <- a06_dadaDB_tax_clean1
a06_dadaDB_tax_clean2$phylum =  a06_dadaDB_tax_clean2$phylum.y
a06_dadaDB_tax_clean2$class =  a06_dadaDB_tax_clean2$class.y
a06_dadaDB_tax_clean2$order =  a06_dadaDB_tax_clean2$order.y
a06_dadaDB_tax_clean2$family =  a06_dadaDB_tax_clean2$family.y
a06_dadaDB_tax_clean2$genus =  a06_dadaDB_tax_clean2$genus.y
a06_dadaDB_tax_clean2$species = a06_dadaDB_tax_clean2$species.y

a06_dadaDB_tax_clean2$phylum[is.na(a06_dadaDB_tax_clean2$phylum.y)] <- a06_dadaDB_tax_clean2$phylum.x[is.na(a06_dadaDB_tax_clean2$phylum.y)]
a06_dadaDB_tax_clean2$order[is.na(a06_dadaDB_tax_clean2$order.y)] <- a06_dadaDB_tax_clean2$order.x[is.na(a06_dadaDB_tax_clean2$order.y)]
a06_dadaDB_tax_clean2$class[is.na(a06_dadaDB_tax_clean2$class.y)] <- a06_dadaDB_tax_clean2$class.x[is.na(a06_dadaDB_tax_clean2$class.y)]
a06_dadaDB_tax_clean2$family[is.na(a06_dadaDB_tax_clean2$family.y)] <- a06_dadaDB_tax_clean2$family.x[is.na(a06_dadaDB_tax_clean2$family.y)]
a06_dadaDB_tax_clean2$genus[is.na(a06_dadaDB_tax_clean2$genus.y)] <- a06_dadaDB_tax_clean2$genus.x[is.na(a06_dadaDB_tax_clean2$genus.y)]
a06_dadaDB_tax_clean2$species[is.na(a06_dadaDB_tax_clean2$species.y)] <- a06_dadaDB_tax_clean2$species.x[is.na(a06_dadaDB_tax_clean2$species.y)]
my_cols <- c("phylum", "class","order","family","genus","species")
a06_dadaDB_tax_clean2$header <- do.call(paste, c(a06_dadaDB_tax_clean2[my_cols], sep = ";"))
a06_dadaDB_tax_clean2$header2 <- paste(">",a06_dadaDB_tax_clean2$header, sep="")

REFDB_dada2 <- a06_dadaDB_tax_clean2[,c("header2","sequence")]

write.table(REFDB_dada2, "REFDB_dada2.fasta", sep="\n", col.names=FALSE, row.names=FALSE, quote = FALSE)

```

```{r update_species_summary}
#Count reference sequences by species
counts_species <- as.data.frame(table(a05_UNIQUEDB$species)) #count the number of unique reference sequences per species
colnames(counts_species) <- c("species", "n_unique_seqs") #format
a03_BESTNAMES <- merge(a03_BESTNAMES, counts_species, by.x= "search_name", by.y="species", all.x=TRUE, all.y=TRUE) # update the summary database with reference sequence counts

#add in duplicates information to BESTNAMES
a03_BESTNAMES$dup_accessions <- "na"
a03_BESTNAMES$dup_species <- "na"
a03_BESTNAMES$dup_species_n <- "na"

for (b in 1:dim(a03_BESTNAMES)[1]){
  dup_accessions <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_accessions"], collapse="|")
  dup_accessions <- unlist(strsplit(dup_accessions, split="\\|"))
  dup_accessions_unique <- paste(unique(dup_accessions), collapse="|")
  
  dup_species <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_species"], collapse="|")
  dup_species <- unlist(strsplit(dup_species, split="\\|"))
  dup_species_unique <- unique(dup_species)
  
    if (length(dup_accessions)>0){
  a03_BESTNAMES$dup_accessions[b]<-dup_accessions_unique
  a03_BESTNAMES$dup_species[b]<- paste(dup_species_unique, collapse="|")
  a03_BESTNAMES$dup_species_n[b]<- length(dup_species_unique)
  }
  
  #reset variables
  rm(dup_accessions)
  rm(dup_species)
}

a03_BESTNAMES$n_all_seqs <- a03_BESTNAMES$n_mitogenome + a03_BESTNAMES$n_target

write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v3.csv"), row.names=FALSE)

n_species <- dim(a03_BESTNAMES)[1]; n_species_target <- length(which(a03_BESTNAMES$n_all_seqs>0))
n_species; n_species_target
```