-
Notifications
You must be signed in to change notification settings - Fork 1
/
quick_figuring.Rmd
246 lines (213 loc) · 13.7 KB
/
quick_figuring.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
---
title: "Quick reference library and gap analysis, Erin Grey 2023-10-17"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```
### UPDATE EACH OF THESE FOUR VARIABLES FOR YOUR ANALYSIS
```{r define_variables}
entrez_key <- "7c5ac035201a1835b5a81de1b74ec8613d08" #GET YOUR OWN ENTREZ KEY AND PUT IT HERE!!!!!
species_list <- "data/gom_inverts_2022-11-27.csv" #where your species list lives
locus = "COI" #name of target locus, see workflow notes above for your choices
output_folder <- "gom_inverts_2023-10-17" #name of your output folder every time
order_list <- read.csv("MetazoaSpeciesByOrder.csv") # list of metazoan orders, updated occasionally (see date in file name)
```
##The rest of this script should just run itself. Hardest part is getting the libraries intalled on your computer.
```{r load_packages_and_terms}
library(taxize) #checks taxonomy
library(rentrez) #queries ENTREZ databases and downloads accessions
library(AnnotationBustR) #finds longest accessions, slice genes from mitogenomes
library(reutils) #other packages need it
library(ape) #convert fasta, fastq, etc.
library(Biostrings)
library(genbankr) #parse genbank files
library(ggplot2) #plots
set_entrez_key(entrez_key) #set the Entrez API key
a01_INPUT <- read.csv(species_list, header=TRUE) # load the species list
dir.create(output_folder)
# create ENTREZ Search terms
data(mtDNAterms) #AnnotationBustR's list of synonyms for different loci
more_12Ssynonyms <- data.frame(Locus="rRNA_12S", Type="rRNA", Name= "small ribosomal RNA subunit RNA") # other synonyms that we find go here and get added to AnnotationBustR's list
mtDNAterms <- rbind(mtDNAterms, more_12Ssynonyms) #format
target_locus_synonyms <- mtDNAterms[mtDNAterms$Locus==locus,] #the target synonyms
target_locus_synonyms$Terms <-
paste0("OR ", target_locus_synonyms$Name, "[TITL]") # format for ENTREZ search terms
target_locus_synonyms$Terms[1] <-
paste0("AND (", target_locus_synonyms$Name[1], "[TITL]") # first term starts with "AND ("
target_locus_synonyms$Terms[dim(target_locus_synonyms)[1]] <-
paste0("OR ", target_locus_synonyms$Name[dim(target_locus_synonyms)[1]], "[TITL])") #last term ends with a ")"
target_locus_searchterm <- paste(as.vector(target_locus_synonyms$Terms), collapse=" ") # the big ENTREZ search term
```
```{r name_check}
a02_NAMECHECK <- gnr_resolve(a01_INPUT$source_binomial, best_match_only = TRUE, canonical = TRUE, fields="all") # check animal species names using the Global Names Resolver from the Encyclopedia of Life
a03_BESTNAMES <- merge(a01_INPUT, a02_NAMECHECK[,c("user_supplied_name", "submitted_name","matched_name2")], by.x=c("source_binomial"), by.y=c("user_supplied_name"), all.x=TRUE) #get the best name but keep previous names (fix misspellings, use most recently accepted, etc.)
a03_BESTNAMES$search_name <- ifelse(is.na(a03_BESTNAMES$matched_name2), a03_BESTNAMES$source_binomial, a03_BESTNAMES$matched_name2) # use the check & corrected name if available, if not then use the original source name
a03_BESTNAMES <- a03_BESTNAMES[!duplicated(a03_BESTNAMES$search_name),]
write.csv(a02_NAMECHECK, file.path(output_folder, "a02_NAMECHECK.csv"), row.names=FALSE)
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v1.csv"), row.names=FALSE)
```
```{r taxonomy}
tax_df <- as.data.frame(matrix(nrow=1, ncol=7))
colnames(tax_df) <- c("db", "query", "phylum", "class", "order", "family", "genus")
for (i in 1443:dim(a03_BESTNAMES)[1]){
x <- tax_name(a03_BESTNAMES$search_name[i], get = c("phylum", "class", "order", "family", "genus"), db = 'ncbi', messages=FALSE)
tax_df <- rbind(tax_df,x)
rm(x)
#slow down request to the Entrez server or you'll get kicked out
Sys.sleep(1)
}
a03_BESTNAMES <- merge(a03_BESTNAMES, tax_df, by.x="search_name", by.y="query", all.x=TRUE)
```
```{r family_seqs}
fams_metazoa <- subset(family_list, kingdom=="Metazoa")
fams_overlap <- met_fams[met_fams$family %in% unique(factor(a03_BESTNAMES$family)),]
fams_missing <- subset(met_fams, !(met_fams$family %in% unique(factor(a03_BESTNAMES$family))))
```
```{r entrez_search}
a03_BESTNAMES$n_mitogenome <- "na" #number of mitogenome accessions
a03_BESTNAMES$n_target <- "na" #number of target accessions
a03_BESTNAMES$ids_mitogenome <- "na" #mitogenome GI numbers
a03_BESTNAMES$ids_target <- "na" #target accession GI numbers
#search ENTREZ nucleotide database ("nucleotide"="nuccore" database)
for (i in 1:dim(a03_BESTNAMES)[1]){
print(i) #counter
# define search terms for species
search_name <- paste0(a03_BESTNAMES$search_name[i],"[ORGN]") #format species name for ENTREZ search
search_term <- paste(paste0(a03_BESTNAMES$search_name[i],"[ORGN]"), target_locus_searchterm, collapse=" ") #concatenate species and 12S search terms into one search term
# search genbank for all accessions, mitogenomes, and target loci accessions
mitogenomes <- entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=999999) # search mitogenome accessions
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
targets <- entrez_search(db="nucleotide", term <- search_term, retmax=999999) # search all species 12S accessions
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
# update the the BESTNAMES dataframe of all accession types and associated ids (= GI numbers)
a03_BESTNAMES$n_mitogenome[i] <-mitogenomes$count
a03_BESTNAMES$n_target[i] <-targets$count
a03_BESTNAMES$ids_mitogenome[i] <-paste(mitogenomes$ids, collapse="|")
a03_BESTNAMES$ids_target[i] <- paste(targets$ids, collapse="|")
# reset loop variables
mitogenomes <- "na"
targets <- "na"
search_name <- "na"
search_term <- "na"
}
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v2.csv"), row.names = FALSE) # write out the mitogenome and target accessions for each species
```
```{r scrape_targets}
a04_REFDB <- data.frame(seq_header=NA, sequence=NA, seq_accession=NA, type=NA, species=NA) #create the database skeleton
a03_BESTNAMES$n_target <- as.numeric(a03_BESTNAMES$n_target) #format
# add target accession sequences to database
for (j in 1:dim(a03_BESTNAMES)[1]){ #for every good species name
ids <- "na"
seqs_target <- "na"
print(j) #counter
if (a03_BESTNAMES$n_target[j]>0 && a03_BESTNAMES$n_target[j]<100) { # scrape GenBank target sequences if available, but skip if >= 100 targets
ids <- c(unlist(strsplit(a03_BESTNAMES$ids_target[j], split="\\|")))
} else if (a03_BESTNAMES$n_target[j]>100) { # if more than 200 accessions, randomly select 100
ids <- sample(c(unlist(strsplit(a03_BESTNAMES$ids_target[j], split="\\|"))),100)
}
if (ids[1] !="na"){ # if there are accessions, fetch them from GenBank
seqs_target <- entrez_fetch(db="nuccore", id=ids, rettype="fasta")
}
if (seqs_target != "na"){
write(seqs_target, file.path(output_folder, paste(a03_BESTNAMES$search_name[j], paste0(locus, ".fasta")))) # formatting - write out the sequences
fasta_target <- readDNAStringSet(file.path(output_folder, paste(a03_BESTNAMES$search_name[j], paste0(locus, ".fasta"))), format="fasta") #formatting - read them back in as fasta
seqs_target_accessions <- entrez_fetch(db="nuccore", id=ids, rettype="acc") # get accession numbers
seq_header <- names(fasta_target) #formatting
sequence <- paste(fasta_target) #formatting
seq_accession <- unlist(strsplit(seqs_target_accessions, split="\n")) # formatting
tempDB <- data.frame(seq_header, sequence, seq_accession, type="accession", species=a03_BESTNAMES$search_name[j]) # make a temporary database with all sequences, their header, accession number, etc.
a04_REFDB <- rbind(a04_REFDB, tempDB) # append temporary database to the full database
}
#reset loop variables
ids <- "na"
seqs_target <- "na"
fasta_target <- "na"
seqs_target_accessions <- "na"
seq_header <- "na"
sequence <- "na"
seq_accession <- "na"
tempDB <- "na"
#slow down request to the Entrez server or you'll get kicked out
Sys.sleep(1)
}
```
```{r scrape_mitogenomes}
a03_BESTNAMES$n_mitogenome <- as.numeric(a03_BESTNAMES$n_mitogenome) #format
for (k in 368:dim(a03_BESTNAMES)[1]) { #for every good species name
print(k) #counter
mito_ids <- "na"
if (a03_BESTNAMES$n_mitogenome[k]>0 && a03_BESTNAMES$n_mitogenome[k]<20) { #scrape Genbank mitogenomes if available, skip species with lots of mitogenomes bc server doesn't allow this
mito_ids <- unlist(strsplit(a03_BESTNAMES$ids_mitogenome[k], split="\\|")) # format mitogenome ids
} else if (a03_BESTNAMES$n_mitogenome[k]>20) {
mito_ids <- sample(unlist(strsplit(a03_BESTNAMES$ids_mitogenome[k], split="\\|")),20) # format mitogenome ids
}
if (mito_ids[1] != "na") {
mito_accessions <- entrez_fetch(mito_ids, db="nuccore", rettype="acc") # find the accession number for each mitogenome id
mito_accessions <- unlist(strsplit(mito_accessions, split="\n")) # format accession numbers
for (m in 1:length(mito_accessions)){ # loop through and scrape each mitogenome accession
gb <- readGenBank(GBAccession(mito_accessions[m])) # get the Genbank annotation for accession
target_feature <- which(genes(gb)$gene %in% as.character(target_locus_synonyms$Name)) # find target annotation metadata (note: use otherFeatures for for rRNAs, tRNAs, etc. and use genes(gb) for genes like COI, CYTB, NADH, etc.)
new_row <- c(paste("Unparsed mitochondrion", mito_accessions[m], sep=" "), "na", mito_accessions[m], "scrape", species=a03_BESTNAMES$search_name[k])
if(length(target_feature) > 0) { # if target feature is found in the parsed mitochondrial genome, find the sequence, otherwise say that its unparsed
target_range <- genes(gb)@ranges[target_feature][1] #extract the target range information, just use the first one
target_strand <- genes(gb)@strand[target_feature[1]] #extract the target strand information (+ or -)
target_seq <- subseq(getSeq(gb), start=target_range@start, width=target_range@width) #scrape the genome for target
scrapedseq_binomial <- names(target_seq) #get the binomial name
scraped_seq <- paste(target_seq) #format
scraped_range <- paste(target_range) #format
new_row <- c(paste(names(target_seq),"mitochondrion", mito_accessions[m], sep=" "), paste(target_seq), mito_accessions[m], "scrape", species=a03_BESTNAMES$search_name[k])
}
a04_REFDB <- rbind(a04_REFDB, new_row) # update the database
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
# reset loop variables
rm(gb, target_feature, target_strand, target_seq, scrapedseq_binomial, scraped_seq, scraped_range, new_row)
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close m loop (each "m" accession m per species "k")
}
# reset loop variables
rm(mito_ids, mito_accessions)
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close for each species k loop
a04_REFDB <- a04_REFDB[-1,] #format - remove the top row of NAs
write.csv(a04_REFDB, file.path(output_folder, "a04_REFDB.csv"), row.names=FALSE)
```
```{r remove_duplicates}
a05_UNIQUEDB <- a04_REFDB[!duplicated(a04_REFDB$sequence),] #remove duplicates
for (i in 1:dim(a05_UNIQUEDB)[1]){ # for every row in the unique db file
dups <- subset(a04_REFDB, sequence == a05_UNIQUEDB$sequence[i]) # find identical sequences in the fill db file
a05_UNIQUEDB$duplicate_accessions[i] <- paste(dups$seq_accession, collapse = "|") # paste all those accessions together into a new unique db field
a05_UNIQUEDB$duplicate_species[i] <- paste(dups[!duplicated(dups$species),"species"], collapse = "|")
}
write.csv(a05_UNIQUEDB, file.path(output_folder, "a05_UniqueRefDB.csv"), row.names=FALSE)
```
```{r update_species_summary}
#Count reference sequences by species
counts_species <- as.data.frame(table(a05_UNIQUEDB$species)) #count the number of unique reference sequences per species
colnames(counts_species) <- c("species", "n_unique_seqs") #format
a03_BESTNAMES <- merge(a03_BESTNAMES, counts_species, by.x= "search_name", by.y="species", all.x=TRUE, all.y=TRUE) # update the summary database with reference sequence counts
#add in duplicates information to BESTNAMES
a03_BESTNAMES$dup_accessions <- "na"
a03_BESTNAMES$dup_species <- "na"
a03_BESTNAMES$dup_species_n <- "na"
for (b in 1:dim(a03_BESTNAMES)[1]){
dup_accessions <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_accessions"], collapse="|")
dup_accessions <- unlist(strsplit(dup_accessions, split="\\|"))
dup_accessions_unique <- paste(unique(dup_accessions), collapse="|")
dup_species <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_species"], collapse="|")
dup_species <- unlist(strsplit(dup_species, split="\\|"))
dup_species_unique <- unique(dup_species)
if (length(dup_accessions)>0){
a03_BESTNAMES$dup_accessions[b]<-dup_accessions_unique
a03_BESTNAMES$dup_species[b]<- paste(dup_species_unique, collapse="|")
a03_BESTNAMES$dup_species_n[b]<- length(dup_species_unique)
}
#reset variables
rm(dup_accessions)
rm(dup_species)
}
a03_BESTNAMES$n_all_seqs <- a03_BESTNAMES$n_mitogenome + a03_BESTNAMES$n_target
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v3.csv"), row.names=FALSE)
n_species <- dim(a03_BESTNAMES)[1]; n_species_target <- length(which(a03_BESTNAMES$n_all_seqs>0))
n_species; n_species_target
```