-
Notifications
You must be signed in to change notification settings - Fork 1
/
GOM_Metazoan_COI_script.Rmd
513 lines (460 loc) · 27.2 KB
/
GOM_Metazoan_COI_script.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
---
title: "Quick reference library for Emily Lancaster, GOM Invertebrates COI, Erin Grey 2023-10-20"
output: html_document
---
```{r setup, include=FALSE}
rm(list=ls()) # clear working environment
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```
### UPDATE EACH OF THESE FOUR VARIABLES FOR YOUR ANALYSIS
```{r define_variables}
entrez_key <- "7c5ac035201a1835b5a81de1b74ec8613d08" #GET YOUR OWN ENTREZ KEY AND PUT IT HERE!
locus = "COI" #name of target locus, see workflow notes above for your choices
output_folder <- "gom_inverts_2023-10-17" #name of your output folder every time
species_list <- read.csv("gom_inverts_2023-10-17/a03_BESTNAMES_v1.csv") # your species list
order_list <- read.csv("MetazoaSpeciesByOrder_2023-09-19.csv") # list of metazoan orders
```
### LOAD PACKAGES, DEFINE TERM, AND CREATe DATABASE SKELETONS. NOTE - LOADING PACKAGES CAN BE TRICKY
```{r load_packages_and_terms}
library(taxizedb) #checks taxonomy
library(rentrez) #queries ENTREZ databases and downloads accessions
library(AnnotationBustR) #finds longest accessions, slice genes from mitogenomes
library(reutils) #other packages need it
library(ape) #convert fasta, fastq, etc.
library(Biostrings)
library(genbankr) #parse genbank files
library(ggplot2) #plots
set_entrez_key(entrez_key) #set the Entrez API key
dir.create(output_folder)
# create ENTREZ Search terms
data(mtDNAterms) #AnnotationBustR's list of synonyms for different loci
more_12Ssynonyms <- data.frame(Locus="rRNA_12S", Type="rRNA", Name= "small ribosomal RNA subunit RNA") # other synonyms that we find go here and get added to AnnotationBustR's list
mtDNAterms <- rbind(mtDNAterms, more_12Ssynonyms) #format
target_locus_synonyms <- mtDNAterms[mtDNAterms$Locus==locus,] #the target synonyms
target_locus_synonyms$Terms <-
paste0("OR ", target_locus_synonyms$Name, "[TITL]") # format for ENTREZ search terms
target_locus_synonyms$Terms[1] <-
paste0("AND (", target_locus_synonyms$Name[1], "[TITL]") # first term starts with "AND ("
target_locus_synonyms$Terms[dim(target_locus_synonyms)[1]] <-
paste0("OR ", target_locus_synonyms$Name[dim(target_locus_synonyms)[1]], "[TITL])") #last term ends with a ")"
target_locus_searchterm <- paste(as.vector(target_locus_synonyms$Terms), collapse=" ") # the big ENTREZ search term
# create empty dataframes to fill in during loops
tax_df <- as.data.frame(matrix(nrow=1, ncol=7, dimnames=list(NULL, c("tax_query", "phylum", "class", "order", "family", "genus", "species")))) # create taxonomy skeleton
tax_df2 <- as.data.frame(matrix(nrow=1, ncol=7, dimnames=list(NULL, c("tax_query", "phylum", "class", "order", "family", "genus", "species")))) # create taxonomy skeleton
order_reps <- as.data.frame(matrix(nrow=1, ncol=6, dimnames=list(NULL, c("phylum", "class", "order", "species_id", "ids_mitogenome","ids_target")))) # create missing orders representatives skeleton
a04_REFDB <- data.frame(seq_header=NA, sequence=NA, seq_accession=NA, type=NA, species=NA) #create reference database skeleton
t = 1 # taxonomy loop counter
o = 1 # missing orders loop counter
j = 1 # missing orders species loop counter
i = 1 # ENTREZ search loop counter
s = 1 # sequence scrape loop counter
m = 1 # mitogenome scrape loop counter
z = 1 # output format loop counter
d = 1 # dada2 taxonomy counter
```
## The rest of this script should just run itself. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r taxonomy}
taxonomies_cls <- taxizedb::classification(species_list[,5], db="ncbi")#get full taxonomies for each species, output is a list-like "classification" object that sucks
#loop through the sucky classification object to populate the empty dataframe
for (t in 1:length(taxonomies_cls)) {
x <- as.data.frame(taxonomies_cls[t])
tax_query <- strsplit(colnames(x)[1], split="[.]")
tax_query <- paste(tax_query[[1]][1], tax_query[[1]][2], sep=" ")
# update the taxonomy dataframe skeleton
tax_df[nrow(tax_df)+1,] <- rep("NA", dim(tax_df)[2])
tax_df$tax_query[t] <- tax_query
if (dim(x)[1]>1) {
c1 <- paste0(x[which(x[,2]=="phylum"),c(1,3)], collapse="_")
c2 <- paste0(x[which(x[,2]=="class"),c(1,3)], collapse="_")
c3 <- paste0(x[which(x[,2]=="order"),c(1,3)], collapse="_")
c4 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
c5 <- paste0(x[which(x[,2]=="genus"),c(1,3)], collapse="_")
c6 <- paste0(x[which(x[,2]=="species"),c(1,3)][1,], collapse="_")
}
if (exists("c1")){
if (length(c1) > 0) {
tax_df$phylum[t] <- c1
} else {
tax_df$phylum[t] <- "na"
}
}
if (exists("c2")){
if (length(c2) > 0) {
tax_df$class[t] <- c2
} else {
tax_df$class[t] <- "na"
}
}
if (exists("c3")){
if (length(c3) > 0) {
tax_df$order[t] <- c3
} else {
tax_df$order[t] <- "na"
}
}
if (exists("c4")){
if (length(c4) > 0) {
tax_df$family[t] <- c4
} else {
tax_df$family[t] <- "na"
}
}
if (exists("c5")){
if (length(c5) > 0) {
tax_df$genus[t] <- c5
} else {
tax_df$genus[t] <- "na"
}
}
if (exists("c6")){
if (length(c6) > 0) {
tax_df$species[t] <- c6
} else {
tax_df$species[t] <- "na"
}
}
suppressWarnings(rm(list = c("x", "tax_query", "c1","c2","c3","c4","c5","c6")))
}
a03_BESTNAMES <- merge(species_list, tax_df, by.x="search_name", by.y="tax_query", all.x=TRUE)
a03_BESTNAMESwithOrderTax <- droplevels(subset(a03_BESTNAMES, !is.na(a03_BESTNAMES$order))) # species with taxonomies
orders_missing <- droplevels(subset(order_list, !(order_list$order %in% unique(factor(a03_BESTNAMESwithOrderTax$order))))) # orders that are missing from species list
row.names(orders_missing) = NULL # format
```
## Find up to 3 mitogenomes or target sequences for missing orders. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r missing_orders_seqs}
# find sequences for up to three species for each missing order; search mitogenomes first, then regular accessions
while (o <= dim(orders_missing)[1]) { # for every missing order
cat("\r", "finding ref seqs for order", o, "of", dim(orders_missing)[1]) #counter
STOPIT <- "NO" # set STOPIT to NO for this order
while (STOPIT == "NO"){ # keep searching as long as STOPIT = "NO"
species_ids_ls <- strsplit(orders_missing$spp_list[[o]], ";") # get the species ids for that order
if (is.na(species_ids_ls[[1]][1])) {STOPIT <- "YES"} #IF THERE ARE NO GENBANK SPECIES FOR THIS ORDER, STOP SEARCHING FOR IT
if (length(species_ids_ls[[1]])<=3) {
species_ids <- species_ids_ls[[1]] #format species ids
for (j in 1:length(species_ids)){ # for each species
search_name <- paste0("txid",species_ids[j],"[Organism]")
mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
if(class(mitogenomes)[1] != "try-catch") { #if the search went through
if (length(mitogenomes$ids)>0) {
mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o], species_ids[i], mito_id, "NA")
order_reps <- rbind(order_reps, row_info)
rm(list=c("mito_id","row_info"))
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
} else {
targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
if(class(targets)[1] != "try-catch") {
if (length(targets$ids)>0) {
target_id <- sample(targets$ids,1) #choose a random accession for this species
row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o],species_ids[j], "NA", target_id)
order_reps <- rbind(order_reps,row_info)
rm(list=c("target_id","row_info"))
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
}
}
rm(targets)
}
rm(search_name); rm(mitogenomes)
}
STOPIT <- "YES"
}
} #If there are 3 or fewer species, search them all for a mitogenome or accession then STOP SEARCHING
if (length(species_ids_ls[[1]])>3){
species_ids <- sample(species_ids_ls[[1]]) #randomize the species for each order
finds = 0 #start with zero finds
for (j in 1:length(species_ids)){
search_name <- paste0("txid",species_ids[j],"[Organism]")
mitogenomes <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) # search mitogenomes
if(class(mitogenomes)[1] != "try-catch") { #if the search went through
if (length(mitogenomes$ids)>0) {
mito_id <- sample(mitogenomes$ids,1) #choose a random mitogenome for this species
row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o], species_ids[j], mito_id, "NA")
order_reps <- rbind(order_reps,row_info)
finds=finds+1 #add to find count
rm(mito_id)
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
} else {
targets <- tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, target_locus_searchterm, collapse=" "), retmax=999999))
if(class(targets)[1] != "try-catch") {
if (length(targets$ids)>0) {
target_id <- sample(targets$ids,1) #choose a random accession for this species
row_info <- c(orders_missing$phylum[o], orders_missing$class[o], orders_missing$order[o],species_ids[j], "NA", target_id)
order_reps <- rbind(order_reps,row_info)
finds=finds+1 #add to find count
rm(target_id); rm(row_info)
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
}
}
rm(targets)
}
rm(search_name); rm(mitogenomes)
if (finds >= 3) break
STOPIT <- "YES"
}
} #If there are >=3 species, randomize and search mitogenomes or accessions until you get 3 or have searched them all
}
rm(species_ids_ls)
o=o+1
}
}
order_reps <- na.omit(order_reps) #substract the first row because its all NAs
order_reps$search_name <- "NA"; order_reps$source_binomial<- "NA";order_reps$source<-"NA"; order_reps$submitted_name<-"NA"; order_reps$matched_name2<-"NA"; order_reps$n_mitogenome<-"NA"; order_reps$n_target<-"NA"; order_reps$species<-"NA"; order_reps$family<-"NA"; order_reps$genus<-"NA" #make column names same as a)# best names for merging
order_reps$n_mitogenome <- ifelse(order_reps$ids_mitogenome!="NA",1,0) #update the n_mitogenome column
order_reps$n_target <- ifelse(order_reps$ids_target!="NA",1,0) #update the n_target column
```
## Search Entrez for mitogenome and target sequence for your species. If you run into an HTTP 500 error, just start the loop again and you should be good.
```{r search_entrez}
while (i <= dim(a03_BESTNAMES)[1]){
cat("\r", "finding Genbank accessions & mitogenomes for species", i, "of", dim(a03_BESTNAMES)[1]) #counter
# define search terms for species
search_name <- paste0(a03_BESTNAMES$search_name[i],"[ORGN]") #format species name for ENTREZ search
search_term <- paste(search_name, target_locus_searchterm, collapse=" ") #mash species & locus terms into one search term
# search ENTREZ for mitogenomes and then for accessions
tryCatch(entrez_search(db="nucleotide", term <- paste(search_name, "AND mitochondrion[TITL] AND complete genome[TITL]"), retmax=9999)) -> mitogenomes # search mitogenomes
if(class(mitogenomes)[1] != "try-catch") {
a03_BESTNAMES$n_mitogenome[i] <-mitogenomes$count #add mitogenome count names dataframe
a03_BESTNAMES$ids_mitogenome[i] <-paste(mitogenomes$ids, collapse="|") #add mitogenome ids to name dataframe
tryCatch(entrez_search(db="nucleotide", term <- search_term, retmax=9999)) -> targets # search accessions
if(class(targets)[1] != "try-catch") {
a03_BESTNAMES$n_target[i] <-targets$count
a03_BESTNAMES$ids_target[i] <- paste(targets$ids, collapse="|")
}
}
Sys.sleep(1) #slow down request to the Entrez server or you'll get kicked out
# reset loop variables
mitogenomes <- "na"
targets <- "na"
search_name <- "na"
search_term <- "na"
i <- i + 1
}
a03_BESTNAMES$species_id <- "NA" #make similar to order_reps for merging
orders_rep <- order_reps[,colnames(a03_BESTNAMES)] # format the order_reps dataframe to add to the a03_BESTNAMES datafame
a03_BESTNAMES<- rbind(a03_BESTNAMES, order_reps) # add the order_reps dataframe to the a03BESTANAMES dataframe
a03_BESTNAMES$n_target <- as.numeric(a03_BESTNAMES$n_target) #format
a03_BESTNAMES$n_mitogenome <- as.numeric(a03_BESTNAMES$n_mitogenome) #format
row.names(a03_BESTNAMES) = NULL # format
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v2.csv"), row.names = FALSE) # save the mitogenome and target accessions ids for scraping
```
```{r scrape_accessions}
while (s <= dim(a03_BESTNAMES)[1]){ #for every good species name
cat("\r","scraping sequences for species", s, "of",dim(a03_BESTNAMES)[1])
ids <- "na"
seqs_target <- "na"
if (a03_BESTNAMES$n_target[s]>0 && a03_BESTNAMES$n_target[s]<100) { # scrape GenBank target sequences if available, but skip if >= 100 targets
ids <- c(unlist(strsplit(a03_BESTNAMES$ids_target[s], split="\\|")))
} else if (a03_BESTNAMES$n_target[s]>100) { # if more than 200 accessions, randomly select 100
ids <- sample(c(unlist(strsplit(a03_BESTNAMES$ids_target[s], split="\\|"))),100)
}
if (ids[1] !="na"){ # if there are accessions, fetch them from GenBank
seqs_target <- tryCatch(entrez_fetch(db="nuccore", id=ids, rettype="fasta"))
}
if(class(seqs_target) != "try-catch"){
if (seqs_target != "na"){
write(seqs_target, file.path(output_folder, paste(a03_BESTNAMES$search_name[s], paste0(locus, ".fasta")))) # formatting - write out the sequences
fasta_target <- readDNAStringSet(file.path(output_folder, paste(a03_BESTNAMES$search_name[s], paste0(locus, ".fasta"))), format="fasta") #formatting - read them back in as fasta
seqs_target_accessions <- entrez_fetch(db="nuccore", id=ids, rettype="acc") # get accession numbers
seq_header <- names(fasta_target) #formatting
sequence <- paste(fasta_target) #formatting
seq_accession <- unlist(strsplit(seqs_target_accessions, split="\n")) # formatting
tempDB <- data.frame(seq_header, sequence, seq_accession, type="accession", species=a03_BESTNAMES$search_name[s]) # make a temporary database with all sequences, their header, accession number, etc.
a04_REFDB <- rbind(a04_REFDB, tempDB) # append temporary database to the full database
}
s=s+1
}
#reset loop variables
fasta_target<-"na"; seqs_target_accessions<-"na"; seq_header<-"na"; sequence<-"na"; seq_accession<-"na";
tempDB <-"na"
#slow down request to the Entrez server or you'll get kicked out
Sys.sleep(1)
}
```
```{r scrape_mitogenomes}
while (m <= dim(a03_BESTNAMES)[1]) { #for every good species name
cat("\r","scraping mitogenomes for species", m, "of", dim(a03_BESTNAMES)[1])
mito_ids <- "na"
if (a03_BESTNAMES$n_mitogenome[m]>0 && a03_BESTNAMES$n_mitogenome[m]<20) { #if mitogenomes available and <20
mito_ids <- unlist(strsplit(a03_BESTNAMES$ids_mitogenome[m], split="\\|")) #format ids
} else if (a03_BESTNAMES$n_mitogenome[m]>20) { #if >20 mitogenomes, sample 20 randomly
mito_ids <- sample(unlist(strsplit(a03_BESTNAMES$ids_mitogenome[m], split="\\|")),20) # format ids
}
if (mito_ids[1] != "na") { #if there are mitogenome ids for species m
mito_accessions <- tryCatch(entrez_fetch(mito_ids, db="nuccore", rettype="acc")) #get accessions for each id
if (class(mito_accessions) != "try-catch"){ #if no api error
mito_accessions <- unlist(strsplit(mito_accessions, split="\n")) #format accession numbers
for (n in 1:length(mito_accessions)){ # loop through and scrape each mitogenome accession
gb <- readGenBank(GBAccession(mito_accessions[n])) # get the Genbank annotation for accession
target_feature <- which(genes(gb)$gene %in% as.character(target_locus_synonyms$Name)) # find target locus annotation metadata (note: use otherFeatures for for rRNAs, tRNAs, etc. and use genes(gb) for genes like COI, CYTB, NADH, etc.)
new_row <- c(paste("Unparsed mitochondrion", mito_accessions[n], sep=" "), "na", mito_accessions[n], "scrape", species=a03_BESTNAMES$search_name[m])
if(length(target_feature) > 0) { # if target feature is found
target_range <- genes(gb)@ranges[target_feature][1] #extract the target range info
target_strand <- genes(gb)@strand[target_feature[1]] #extract the target strand info (+ or -)
target_seq <- subseq(getSeq(gb), start=target_range@start, width=target_range@width) #scrape seq
scrapedseq_binomial <- names(target_seq) #get the binomial name
scraped_seq <- paste(target_seq) #format
scraped_range <- paste(target_range) #format
new_row <- c(paste(names(target_seq),"mitochondrion", mito_accessions[n], sep=" "), paste(target_seq), mito_accessions[n], "scrape", species=a03_BESTNAMES$search_name[m]) #update information
}
a04_REFDB <- rbind(a04_REFDB, new_row) # update the database
rm(gb, target_feature, target_strand, target_seq, scrapedseq_binomial, scraped_seq, scraped_range, new_row) # reset n loop variables
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close n loop (each "n" mitogenome accession per "m" species)
}
}
m=m+1 #update species m variable
rm(mito_ids, mito_accessions) # reset loop variables
Sys.sleep(0.5) #slow down request to the Entrez server or you'll get kicked out
} # close for each species m loop
a04_REFDB <- a04_REFDB[-1,] #format - remove the top row of NAs
a04_REFDB_unparsed <- subset(a04_REFDB, sequence == "na")
a04_REFDB_parsed <- subset(a04_REFDB, sequence != "na")
write.csv(a04_REFDB_parsed, file.path(output_folder, "a04_REFDB.csv"), row.names=FALSE)
```
```{r remove_duplicates}
a05_UNIQUEDB <- a04_REFDB[!duplicated(a04_REFDB$sequence),] #remove duplicates
for (i in 1:dim(a05_UNIQUEDB)[1]){ # for every row in the unique db file
dups <- subset(a04_REFDB, sequence == a05_UNIQUEDB$sequence[i]) # find identical sequences in the fill db file
a05_UNIQUEDB$duplicate_accessions[i] <- paste(dups$seq_accession, collapse = "|") # paste all those accessions together into a new unique db field
a05_UNIQUEDB$duplicate_species[i] <- paste(dups[!duplicated(dups$species),"species"], collapse = "|")
}
row.names(a05_UNIQUEDB) <- NULL #format
write.csv(a05_UNIQUEDB, file.path(output_folder, "a05_UniqueRefDB.csv"), row.names=FALSE)
```
```{r format_dada2}
a05_UNIQUEDB_format <- a05_UNIQUEDB #find seqs that are missing species names
row.names(a05_UNIQUEDB_format) <- NULL #format
for (z in 1:dim(a05_UNIQUEDB_format)[1]){
name_string <- strsplit(a05_UNIQUEDB_format$seq_header[z], " ")[[1]]
if (a05_UNIQUEDB_format$type[z]=="scrape"){
a05_UNIQUEDB_format$species[z] <- paste(name_string[1], name_string[2], sep=" ")
} else {
if(a05_UNIQUEDB_format$type[z]=="accession"){
a05_UNIQUEDB_format$species[z] <- paste(name_string[2], name_string[3], sep=" ")
}
}
rm(name_string)
}
a03_BESTNAMES$search_name <- as.character(a03_BESTNAMES$search_name) #format
a05_UNIQUEDB_format$species <- as.character(a05_UNIQUEDB_format$species) #format
write.csv(a05_UNIQUEDB, file.path(output_folder, "a05_UniqueRefDB.csv"), row.names=FALSE)
a06_dadaDB <- merge(x=a05_UNIQUEDB_format, y=a03_BESTNAMES, by.x="species", by.y="search_name", all.x=TRUE, all.y=FALSE)
```
```{r taxonomy_dada2}
taxonomies_all_cls <- taxizedb::classification(a06_dadaDB[,1], db="ncbi")#get full taxonomies for each species, output is a list-like "classification" object that sucks
#loop through the sucky classification object to populate the empty dataframe
for (d in 1:length(taxonomies_all_cls)) {
x <- as.data.frame(taxonomies_all_cls[d])
tax_query <- strsplit(colnames(x)[1], split="[.]")
tax_query <- paste(tax_query[[1]][1], tax_query[[1]][2], sep=" ")
# update the taxonomy dataframe skeleton
tax_df2[nrow(tax_df2)+1,] <- rep("NA", dim(tax_df2)[2])
tax_df2$tax_query[d] <- tax_query
if (dim(x)[1]>1) {
c1 <- paste0(x[which(x[,2]=="phylum"),c(1,3)], collapse="_")
c2 <- paste0(x[which(x[,2]=="class"),c(1,3)], collapse="_")
c3 <- paste0(x[which(x[,2]=="order"),c(1,3)], collapse="_")
c4 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
c5 <- paste0(x[which(x[,2]=="genus"),c(1,3)], collapse="_")
c6 <- paste0(x[which(x[,2]=="species"),c(1,3)][1,], collapse="_")
}
if (exists("c1")){
if (length(c1) > 0) {
tax_df2$phylum[d] <- c1
} else {
tax_df2$phylum[d] <- "na"
}
}
if (exists("c2")){
if (length(c2) > 0) {
tax_df2$class[d] <- c2
} else {
tax_df2$class[d] <- "na"
}
}
if (exists("c3")){
if (length(c3) > 0) {
tax_df2$order[d] <- c3
} else {
tax_df2$order[d] <- "na"
}
}
if (exists("c4")){
if (length(c4) > 0) {
tax_df2$family[d] <- c4
} else {
tax_df2$family[d] <- "na"
}
}
if (exists("c5")){
if (length(c5) > 0) {
tax_df2$genus[d] <- c5
} else {
tax_df2$genus[d] <- "na"
}
}
if (exists("c6")){
if (length(c6) > 0) {
tax_df2$species[d] <- c6
} else {
tax_df2$species[d] <- "na"
}
}
suppressWarnings(rm(list = c("x", "tax_query", "c1","c2","c3","c4","c5","c6")))
}
tax_df2 <- tax_df2[!duplicated(tax_df2$tax_query),] #remove duplicated taxa
tax_df2$tax_query <- as.character(tax_df2$tax_query) #format for merging
a06_dadaDB$species <- as.character(a06_dadaDB$species) #format for merging
a06_dadaDB_tax <- merge(a06_dadaDB, tax_df2, by.x="species", by.y="tax_query", all.x=TRUE) #merge taxonomic information into ref database
a06_dadaDB_tax[a06_dadaDB_tax=="NA"] <- NA
colnames(a06_dadaDB_tax)[17] <- "species.x"
tax_fails <- a06_dadaDB_tax[is.na(a06_dadaDB_tax$order.x) & is.na(a06_dadaDB_tax$order.y),] #rows that are missing all higher-level taxonomic information
a06_dadaDB_tax_clean1 <- a06_dadaDB_tax[!(is.na(a06_dadaDB_tax$order.x) & is.na(a06_dadaDB_tax$order.y)),] # remove tax_fails
a06_dadaDB_tax_clean2 <- a06_dadaDB_tax_clean1
a06_dadaDB_tax_clean2$phylum = a06_dadaDB_tax_clean2$phylum.y
a06_dadaDB_tax_clean2$class = a06_dadaDB_tax_clean2$class.y
a06_dadaDB_tax_clean2$order = a06_dadaDB_tax_clean2$order.y
a06_dadaDB_tax_clean2$family = a06_dadaDB_tax_clean2$family.y
a06_dadaDB_tax_clean2$genus = a06_dadaDB_tax_clean2$genus.y
a06_dadaDB_tax_clean2$species = a06_dadaDB_tax_clean2$species.y
a06_dadaDB_tax_clean2$phylum[is.na(a06_dadaDB_tax_clean2$phylum.y)] <- a06_dadaDB_tax_clean2$phylum.x[is.na(a06_dadaDB_tax_clean2$phylum.y)]
a06_dadaDB_tax_clean2$order[is.na(a06_dadaDB_tax_clean2$order.y)] <- a06_dadaDB_tax_clean2$order.x[is.na(a06_dadaDB_tax_clean2$order.y)]
a06_dadaDB_tax_clean2$class[is.na(a06_dadaDB_tax_clean2$class.y)] <- a06_dadaDB_tax_clean2$class.x[is.na(a06_dadaDB_tax_clean2$class.y)]
a06_dadaDB_tax_clean2$family[is.na(a06_dadaDB_tax_clean2$family.y)] <- a06_dadaDB_tax_clean2$family.x[is.na(a06_dadaDB_tax_clean2$family.y)]
a06_dadaDB_tax_clean2$genus[is.na(a06_dadaDB_tax_clean2$genus.y)] <- a06_dadaDB_tax_clean2$genus.x[is.na(a06_dadaDB_tax_clean2$genus.y)]
a06_dadaDB_tax_clean2$species[is.na(a06_dadaDB_tax_clean2$species.y)] <- a06_dadaDB_tax_clean2$species.x[is.na(a06_dadaDB_tax_clean2$species.y)]
my_cols <- c("phylum", "class","order","family","genus","species")
a06_dadaDB_tax_clean2$header <- do.call(paste, c(a06_dadaDB_tax_clean2[my_cols], sep = ";"))
a06_dadaDB_tax_clean2$header2 <- paste(">",a06_dadaDB_tax_clean2$header, sep="")
REFDB_dada2 <- a06_dadaDB_tax_clean2[,c("header2","sequence")]
write.table(REFDB_dada2, "REFDB_dada2.fasta", sep="\n", col.names=FALSE, row.names=FALSE, quote = FALSE)
```
```{r update_species_summary}
#Count reference sequences by species
counts_species <- as.data.frame(table(a05_UNIQUEDB$species)) #count the number of unique reference sequences per species
colnames(counts_species) <- c("species", "n_unique_seqs") #format
a03_BESTNAMES <- merge(a03_BESTNAMES, counts_species, by.x= "search_name", by.y="species", all.x=TRUE, all.y=TRUE) # update the summary database with reference sequence counts
#add in duplicates information to BESTNAMES
a03_BESTNAMES$dup_accessions <- "na"
a03_BESTNAMES$dup_species <- "na"
a03_BESTNAMES$dup_species_n <- "na"
for (b in 1:dim(a03_BESTNAMES)[1]){
dup_accessions <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_accessions"], collapse="|")
dup_accessions <- unlist(strsplit(dup_accessions, split="\\|"))
dup_accessions_unique <- paste(unique(dup_accessions), collapse="|")
dup_species <- paste(a05_UNIQUEDB[a05_UNIQUEDB$species == a03_BESTNAMES$search_name[b], "duplicate_species"], collapse="|")
dup_species <- unlist(strsplit(dup_species, split="\\|"))
dup_species_unique <- unique(dup_species)
if (length(dup_accessions)>0){
a03_BESTNAMES$dup_accessions[b]<-dup_accessions_unique
a03_BESTNAMES$dup_species[b]<- paste(dup_species_unique, collapse="|")
a03_BESTNAMES$dup_species_n[b]<- length(dup_species_unique)
}
#reset variables
rm(dup_accessions)
rm(dup_species)
}
a03_BESTNAMES$n_all_seqs <- a03_BESTNAMES$n_mitogenome + a03_BESTNAMES$n_target
write.csv(a03_BESTNAMES, file.path(output_folder, "a03_BESTNAMES_v3.csv"), row.names=FALSE)
n_species <- dim(a03_BESTNAMES)[1]; n_species_target <- length(which(a03_BESTNAMES$n_all_seqs>0))
n_species; n_species_target
```