-
Notifications
You must be signed in to change notification settings - Fork 2
/
Get_OrderSeqs-Metazoan-COI.Rmd
306 lines (284 loc) · 18.8 KB
/
Get_OrderSeqs-Metazoan-COI.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
---
title: "Get_OrderSeqs-Metazoan-COI"
author: "Erin Grey"
date: "2024-03-13"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
```
### DEFINE THE FOLLOWING VARIABLES FOR YOUR ANALYSIS
## locus choices are: (GENES) "ATP6" "ATP8" "COI" "COII" "COIII" "CYTB" "ND1" "ND2" "ND3" "ND4" "ND4L" "ND5" "ND6" and (OTHER FEATURES) "rRNA_12S" "rRNA_16S" "D_loop" "tRNA_Ala" "tRNA_Arg" "tRNA_Asn" "tRNA_Asp" "tRNA_Cys" "tRNA_Gln" "tRNA_Glu" "tRNA_Gly" "tRNA_His" "tRNA_Ile" "tRNA_Leu" "tRNA_Lys" "tRNA_Met" "tRNA_Phe" "tRNA_Pro" "tRNA_Ser" "tRNA_Thr" "tRNA_Trp" "tRNA_Tyr" "tRNA_Val"
## species_list must have "search_name" as the first column and it must be a binomial species name
```{r define_variables}
entrez_key <- "You_Entrez_Key_Here" #GET YOUR OWN ENTREZ KEY AND PUT IT HERE!
locus = "COI" #name of target locus, your choices are
output_folder <- "temp_output" #name of your output folder every time
order_list <- read.csv("workingfiles/MetazoaSpeciesByOrder_2023-09-19.csv") # list of vertebrate orders
```
### LOAD PACKAGES, DEFINE TERM, AND CREATE DATABASE SKELETONS.
```{r load_packages_and_terms}
library(taxizedb) #checks taxonomy
library(rentrez) #queries ENTREZ databases and downloads accessions
library(AnnotationBustR) #finds longest accessions, slice genes from mitogenomes
library(reutils) #other packages need it
library(ape) #convert fasta, fastq, etc.
library(ggplot2) #plots
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
library(Biostrings)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("genbankr")
library(genbankr) ##genbankr is not functioning on latest Bioconductor, install locally instead
set_entrez_key(entrez_key) #set the Entrez API key
dir.create(output_folder)
# create search terms for ENTREZ
data(mtDNAterms) #AnnotationBustR's list of synonyms for different loci
more_12Ssynonyms <- data.frame(Locus="rRNA_12S", Type="rRNA", Name= "small ribosomal RNA subunit RNA") # other synonyms that we find go here and get added to AnnotationBustR's list
mtDNAterms <- rbind(mtDNAterms, more_12Ssynonyms) #format
is_mtgene <- locus %in% c("ATP6", "ATP8", "COI", "COII", "COIII", "CYTB", "ND1", "ND2", "ND3", "ND4", "ND4L", "ND5", "ND6") #check whether the locus is a gene or other feature, needed for the mitogenome scrape
target_locus_synonyms <- mtDNAterms[mtDNAterms$Locus==locus,] #the target synonyms
target_locus_synonyms$Terms <- paste0("OR ", target_locus_synonyms$Name, "[TITL]") # format for ENTREZ search terms
target_locus_synonyms$Terms[1] <- paste0("AND (", target_locus_synonyms$Name[1], "[TITL]") # first term starts with "AND ("
target_locus_synonyms$Terms[dim(target_locus_synonyms)[1]] <- paste0("OR ", target_locus_synonyms$Name[dim(target_locus_synonyms)[1]], "[TITL])") #last term ends with a ")"
target_locus_searchterm <- paste(as.vector(target_locus_synonyms$Terms), collapse=" ") # the big ENTREZ search term
# create empty dataframes to fill in during loops
tax_df <- as.data.frame(matrix(nrow=1, ncol=9, dimnames=list(NULL, c("tax_query", "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species")))) # taxonomy skeleton
order_seqs <- as.data.frame(matrix(nrow=1, ncol=8, dimnames=list(NULL, c("superkingdom", "kingdom", "phylum", "class", "order", "species_id", "ids_mitogenome","ids_target")))) # missing orders skeleton
a02_REFDB <- data.frame(seq_header=NA, sequence=NA, seq_accession=NA, type=NA, species=NA) #create reference database skeleton
t = 1 # taxonomy for species list loop counter
u = 1 # taxonomy for order sequences loop counter
o = 1 # missing orders loop counter
j = 1 # missing orders species loop counter
i = 1 # ENTREZ search loop counter
s = 1 # sequence scrape loop counter
m = 1 # mitogenome scrape loop counter
z = 1 # output format loop counter
d = 1 # dada2 taxonomy counter
```
## Find up to 5 mitogenomes or target sequences for each orders. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r orders_seqs}
# find sequences for up to five species for each missing order; search mitogenomes first, then regular accessions
while (o <= dim(order_list)[1]) { # for every missing order
cat("\r", "finding ref seqs for order", o, "of", dim(order_list)[1]) #counter
STOPIT <- "NO" # set STOPIT to NO for this order
while (STOPIT == "NO"){ # keep searching as long as STOPIT = "NO"
species_ids_ls <- strsplit(order_list$spp_list[[o]], ";") # get the species ids for that order
if (is.na(species_ids_ls[[1]][1])) {STOPIT <- "YES"} #IF THERE ARE NO GENBANK SPECIES FOR THIS ORDER, STOP SEARCHING FOR IT
if (length(species_ids_ls[[1]])<=5) {
species_ids <- species_ids_ls[[1]] #format species ids
for (j in 1:length(species_ids)){ # for each species
search_name <- paste0("txid",species_ids[j],"[Organism]")
mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
if(class(mitogenomes)[1] != "try-catch") { #if the search went through
if (length(mitogenomes$ids)>0) {
mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
row_info <- c(order_list$superkingdom[o], order_list$kingdom[o], order_list$phylum[o], order_list$class[o], order_list$order[o], species_ids[i], mito_id, "NA")
order_seqs <- rbind(order_seqs, row_info)
rm(list=c("mito_id","row_info"))
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
} else {
targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
if(class(targets)[1] != "try-catch") {
if (length(targets$ids)>0) {
target_id <- sample(targets$ids,1) #choose a random accession for this species
row_info <- c(order_list$superkingdom[o], order_list$kingdom[o], order_list$phylum[o], order_list$class[o], order_list$order[o],species_ids[j], "NA", target_id)
order_seqs <- rbind(order_seqs,row_info)
rm(list=c("target_id","row_info"))
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
}
}
rm(targets)
}
rm(search_name); rm(mitogenomes)
}
STOPIT <- "YES"
}
} #If there are 5 or fewer species, search them all for a mitogenome or accession then STOP SEARCHING
if (length(species_ids_ls[[1]])>5){
species_ids <- sample(species_ids_ls[[1]]) #randomize the species for each order
finds = 0 #start with zero finds
for (j in 1:length(species_ids)){
search_name <- paste0("txid",species_ids[j],"[Organism]")
mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
if(class(mitogenomes)[1] != "try-catch") { #if the search went through
if (length(mitogenomes$ids)>0) {
mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
row_info <- c(order_list$superkingdom[o], order_list$kingdom[o], order_list$phylum[o], order_list$class[o], order_list$order[o], species_ids[j], mito_id, "NA")
order_seqs <- rbind(order_seqs,row_info)
finds=finds+1 #add to find count
rm(mito_id)
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
} else {
targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
if(class(targets)[1] != "try-catch") {
if (length(targets$ids)>0) {
target_id <- sample(targets$ids,1) #choose a random accession for this species
row_info <- c(order_list$superkingdom[o], order_list$kingdom[o], order_list$phylum[o], order_list$class[o], order_list$order[o],species_ids[j], "NA", target_id)
order_seqs <- rbind(order_seqs,row_info)
finds=finds+1 #add to find count
rm(target_id); rm(row_info)
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
}
}
rm(targets)
}
rm(search_name); rm(mitogenomes)
if (finds >= 3) break
STOPIT <- "YES"
}
} #If there are >=3 species, randomize and search mitogenomes or accessions until you get 3 or have searched them all
}
rm(species_ids_ls)
o=o+1
}
}
order_seqs <- na.omit(order_seqs) #subtract the first row because its all NAs
order_seqs$search_name<-"NA"; order_seqs$n_mitogenome<-"NA"; order_seqs$n_target<-"NA"; order_seqs$species<-"NA"; order_seqs$family<-"NA"; order_seqs$genus<-"NA" #make column names same as a01_NAME for merging
order_seqs$n_mitogenome <- ifelse(order_seqs$ids_mitogenome!="NA",1,0) #update the n_mitogenome column
order_seqs$n_target <- ifelse(order_seqs$ids_target!="NA",1,0) #update the n_target column
```
## Get taxonomy for each species representative for the missing orders.
```{r taxonomy_for_order_seqs}
taxonomies_orderseqs <- taxizedb::classification(order_seqs$species_id, db="ncbi") #get full taxonomies for each species selected to represent missing orders, output is a list-like "classification" object that sucks
#now loop through the sucky classification object to populate the empty dataframe
for (u in 1:length(taxonomies_orderseqs)) {
x <- as.data.frame(taxonomies_orderseqs[u])
if (dim(x)[1]>1) {
c4 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
c5 <- paste0(x[which(x[,2]=="genus"),c(1,3)], collapse="_")
c6 <- paste0(x[which(x[,2]=="species"),c(1,3)][1,], collapse="_")
}
if (exists("c4")){
if (length(c4) > 0) {
order_seqs$family[u] <- c4
} else {
order_seqs$family[u] <- NA
}
}
if (exists("c5")){
if (length(c5) > 0) {
order_seqs$genus[u] <- c5
} else {
order_seqs$genus[u] <- NA
}
}
if (exists("c6")){
if (length(c6) > 0) {
order_seqs$species[u] <- c6
} else {
order_seqs$species[u] <- NA
}
}
suppressWarnings(rm(list = c("x", "tax_query", "c4","c5","c6")))
}
order_seqs$search_name <- data.frame(do.call(rbind, strsplit(order_seqs$species, split = "_")))[,1]
```
## Scrape accessions identified above from GenBank.
```{r scrape_accessions}
#skipped 1455 cf. Makaira nigricans/Istiompax indica JFDC-2020 because threw and error
while (s <= dim(order_seqs)[1]){ #for every good species name
cat("\r","scraping accessions for species", s, "of",dim(order_seqs)[1])
ids <- "na"
seqs_target <- "na"
if (order_seqs$n_target[s]>0 && order_seqs$n_target[s]<100) { # scrape GenBank target sequences if available, but skip if >= 100 targets
ids <- c(unlist(strsplit(order_seqs$ids_target[s], split="\\|")))
} else if (order_seqs$n_target[s]>100) { # if more than 200 accessions, randomly select 100
ids <- sample(c(unlist(strsplit(order_seqs$ids_target[s], split="\\|"))),100)
}
if (ids[1] !="na"){ # if there are accessions, fetch them from GenBank
seqs_target <- tryCatch(entrez_fetch(db="nuccore", id=ids, rettype="fasta"))
}
if(class(seqs_target) != "try-catch"){
if (seqs_target != "na"){
write(seqs_target, file.path(output_folder, paste(order_seqs$search_name[s], paste0(locus, ".fasta")))) # formatting - write out the sequences
fasta_target <- readDNAStringSet(file.path(output_folder, paste(order_seqs$search_name[s], paste0(locus, ".fasta"))), format="fasta") #formatting - read them back in as fasta
file.remove(file.path(output_folder, paste(order_seqs$search_name[s], paste0(locus, ".fasta")))) #remove the file to save space
seqs_target_accessions <- entrez_fetch(db="nuccore", id=ids, rettype="acc") # get accession numbers
seq_header <- names(fasta_target) #formatting
sequence <- paste(fasta_target) #formatting
seq_accession <- unlist(strsplit(seqs_target_accessions, split="\n")) # formatting
tempDB <- data.frame(seq_header, sequence, seq_accession, type="accession", species=order_seqs$search_name[s]) # make a temporary database with all sequences, their header, accession number, etc.
a02_REFDB <- rbind(a02_REFDB, tempDB) # append temporary database to the full database
}
s=s+1
}
#reset loop variables
fasta_target<-"na"; seqs_target_accessions<-"na"; seq_header<-"na"; sequence<-"na"; seq_accession<-"na";
tempDB <-"na"
#slow down request to the Entrez server or you'll get kicked out
Sys.sleep(0.5)
}
```
## Scrape your target locus out of mitogenomes identified in the Entrez search. Note - I need to fix some things here (the loop breaks on parsing errors). If you get stuck, just "m=m+1" and re-start loop
```{r scrape_mitogenomes}
## skipped 245, 836, 1166
while (m <= dim(order_seqs)[1]) { #for every good species name
cat("\r","scraping mitogenomes for species", m, "of", dim(order_seqs)[1])
mito_ids <- "na"
if (order_seqs$n_mitogenome[m]>0 && order_seqs$n_mitogenome[m]<20) { #if mitogenomes available and <20
mito_ids <- unlist(strsplit(order_seqs$ids_mitogenome[m], split="\\|")) #format ids
} else if (order_seqs$n_mitogenome[m]>20) { #if >20 mitogenomes, subsample 20 mitogenomes randomly
mito_ids <- sample(unlist(strsplit(order_seqs$ids_mitogenome[m], split="\\|")),20) # format ids
}
if (mito_ids[1] != "na") { #if there are mitogenome ids for species m
mito_accessions <- tryCatch(entrez_fetch(mito_ids, db="nuccore", rettype="acc")) #ask ENTREZ for id's accession
if (class(mito_accessions) != "try-catch"){ #if no api error
mito_accessions <- unlist(strsplit(mito_accessions, split="\n")) #format accession numbers
for (n in 1:length(mito_accessions)){ # loop through and scrape each mitogenome accession
new_row <- c(paste("Unparsed mitochondrion", mito_accessions[n], sep=" "), "na", mito_accessions[n], "scrape", species=order_seqs$search_name[m]) #blank row for each accession
gb <- tryCatch(readGenBank(GBAccession(mito_accessions[n]))) # get the Genbank annotation for accession
if (class(gb) != "try-catch"){ #if no error in getting gb
if (is_mtgene==FALSE) {# if target locus is not a gene, look in otherFeatures()
target_feature <- tryCatch(which(otherFeatures(gb)$product %in% as.character(target_locus_synonyms$Name))) # find target locus annotation metadata
if (class(target_feature) != "try-catch"){
if(length(target_feature) > 0) { # if target feature is found
target_range <- tryCatch(otherFeatures(gb)@ranges[target_feature]) #extract the target range info
if(class(target_range) != "try-catch"){
target_seq <- tryCatch(subseq(getSeq(gb), start=target_range@start, width=target_range@width)) #scrape seq
if(class(target_seq) != "try-catch"){
scraped_seq <- paste(target_seq) #format
new_row <- c(paste(names(target_seq),"mitochondrion", mito_accessions[n], sep=" "), paste(target_seq), mito_accessions[n], "scrape", order_seqs$search_name[m]) #update information
}
}
}
}
}
else if (is_mtgene==TRUE){ # if target locus is a gene, look in gene()
target_feature <- tryCatch(which(genes(gb)$gene %in% as.character(target_locus_synonyms$Name))) # find target locus annotation metadata
if (class(target_feature) != "try-catch"){
if(length(target_feature) > 0) { # if target feature is found
target_range <- tryCatch(genes(gb)@ranges[target_feature]) #extract the target range info
if(class(target_range) != "try-catch"){
target_seq <- tryCatch(subseq(getSeq(gb), start=target_range@start, width=target_range@width)) #scrape seq
if(class(target_seq) != "try-catch"){
scraped_seq <- paste(target_seq) #format
new_row <- c(paste(names(target_seq),"mitochondrion", mito_accessions[n], sep=" "), paste(target_seq), mito_accessions[n], "scrape", order_seqs$search_name[m]) #update information
}
}
}
}
}
}
a02_REFDB <- rbind(a02_REFDB, new_row) # update the database
rm(gb, target_feature, target_range, target_seq, scraped_seq, new_row) # reset n loop variables
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close n loop (each "n" mitogenome accession per "m" species)
}
}
m=m+1 #update species m variable
rm(mito_ids, mito_accessions) # reset loop variables
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close for each species m loop
a02_REFDB <- a02_REFDB[-1,] #format - remove the top row of NAs
colnames(order_seqs)[12] <- "species_name_id" #format for merging
order_seqs_taxonomies <- order_seqs[,c(1:5,12,9)]
order_seqs_taxonomies <- order_seqs_taxonomies[!duplicated(order_seqs_taxonomies),]
a03_REFDB <- merge(a02_REFDB, order_seqs_taxonomies, by.x="species", by.y="search_name", all.x=TRUE)
a03_REFDB <- a03_REFDB[which(a03_REFDB$sequence != "na"),]
write.csv(a03_REFDB, file.path(output_folder, paste0("OrderSeqs_Metazoan-", locus,
".csv")), row.names=FALSE)
```