-
Notifications
You must be signed in to change notification settings - Fork 2
/
Get_NCBIFamiliesAndOrders.Rmd
134 lines (116 loc) · 5.19 KB
/
Get_NCBIFamiliesAndOrders.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title: "Get_NCBIFamilies"
author: "Erin Grey"
date: "2023-07-11", "2022-09-19"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r housekeeping, include=FALSE}
rm(list=ls()) # clear environment
library("taxizedb") # load packages
```
```{r get_LocalTaxonomyDatabase, include=FALSE}
db_tax_NCBI <- db_download_ncbi(verbose = TRUE, overwrite = FALSE) #download the NCBI taxonomy database
```
```{r get_Families, include=FALSE}
families_NCBI_eukaryotes <- downstream("Eukarya", db = "ncbi", downto = "family") #get all family names in Domain Eukarya, note- output is a list
families_NCBI_eukaryotes_df <- as.data.frame(families_NCBI_eukaryotes[1]) #format from list to data frame
```
```{r get_FamilyTaxonomies, include=FALSE}
taxonomy_of_families <- taxizedb::classification(families_NCBI_eukaryotes_df[,1], db="ncbi") #get full taxonomies for each family, note - output is a list-like "classification" object that sucks
taxonomy_of_families_df <- data.frame(matrix(ncol=7, nrow=length(taxonomy_of_families))) # empty dataframe
colnames(taxonomy_of_families_df) <- c("full_branch","superkingdom","kingdom","phylum","class","order","family") #add column names to the empty dataframe
#loop through the classification object to populate the empty data.frame
for (i in 1:length(taxonomy_of_families)) {
x <- as.data.frame(taxonomy_of_families[i])
c1 <- paste0(x[,1], collapse=";")
c2 <- paste0(x[which(x[,2]=="superkingdom"),c(1,3)], collapse="_")
c3 <- paste0(x[which(x[,2]=="kingdom"),c(1,3)], collapse="_")
c4 <- paste0(x[which(x[,2]=="phylum"),c(1,3)], collapse="_")
c5 <- paste0(x[which(x[,2]=="class"),c(1,3)], collapse="_")
c6 <- paste0(x[which(x[,2]=="order"),c(1,3)], collapse="_")
c7 <- paste0(x[which(x[,2]=="family"),c(1,3)], collapse="_")
if (length(c1) > 0) {
taxonomy_of_families_df$full_branch[i] <- c1
} else {
taxonomy_of_families_df$full_branch[i] <- "na"
}
if (length(c2) > 0) {
taxonomy_of_families_df$superkingdom[i] <- c2
} else {
taxonomy_of_families_df$superkingdom[i] <- "na"
}
if (length(c3) > 0) {
taxonomy_of_families_df$kingdom[i] <- c3
} else {
taxonomy_of_families_df$kingdom[i] <- "na"
}
if (length(c4) > 0) {
taxonomy_of_families_df$phylum[i] <- c4
} else {
taxonomy_of_families_df$phylum[i] <- "na"
}
if (length(c5) > 0) {
taxonomy_of_families_df$class[i] <- c5
} else {
taxonomy_of_families_df$class[i] <- "na"
}
if (length(c6) > 0) {
taxonomy_of_families_df$order[i] <- c6
} else {
taxonomy_of_families_df$order[i] <- "na"
}
if (length(c7) > 0) {
taxonomy_of_families_df$family[i] <- c7
} else {
taxonomy_of_families_df$family[i] <- "na"
}
rm(list = c("c1","c2","c3","c4","c5","c6","c7","x"))
}
write.csv(taxonomy_of_families_df, "EukaryoteFamilies_NCBI_2023-09-19.csv")
```
```{r get_MetazoanSpeciesByOrder, include=FALSE}
taxonomy_of_orders_df <- taxonomy_of_families_df[!duplicated(taxonomy_of_families_df$order),1:6]
species_by_MetazoanOrder <- taxonomy_of_orders_df[which(taxonomy_of_orders_df$kingdom=="Metazoa_33208"),]
for (i in 1:dim(species_by_MetazoanOrder)[1]){
id <- strsplit(species_by_MetazoanOrder$order[i], "_")[[1]][2] #get the iud number for each order
spp_list <- downstream(id, db="ncbi", downto="species")
spp_df <- as.data.frame(spp_list[1])
species_by_MetazoanOrder$spp_list[i] <- paste0(spp_df[,1], collapse=";") #update database
rm(id); rm(spp_list); rm(spp_df) #clean up
}
##for undefined orders, go up to class to find species
for (i in 1:dim(species_by_MetazoanOrder)[1]){
if (species_by_MetazoanOrder$spp_list[i]=="NA") {
id <- strsplit(species_by_MetazoanOrder$class[i], "_")[[1]][2] #get the iud number for the class instead
spp_list <- downstream(id, db="ncbi", downto="species")
spp_df <- as.data.frame(spp_list[1])
species_by_MetazoanOrder$spp_list[i] <- paste0(spp_df[,1], collapse=";") #update database
rm(id); rm(spp_list); rm(spp_df) #clean up
}
}
write.csv(species_by_MetazoanOrder, "MetazoaSpeciesByOrder_2023-09-19.csv", row.names = F)
```
```{r get_MetazoanSpeciesByFamily, include=FALSE}
species_by_MetazoanFamily <- taxonomy_of_families_df[which(taxonomy_of_families_df$kingdom=="Metazoa_33208"),]
for (i in 1:dim(species_by_MetazoanFamily)[1]){
id <- strsplit(species_by_MetazoanFamily$family[i], "_")[[1]][2] #get the iud number for each order
spp_list <- downstream(id, db="ncbi", downto="species")
spp_df <- as.data.frame(spp_list[1])
species_by_MetazoanFamily$spp_list[i] <- paste0(spp_df[,1], collapse=";") #update database
rm(id); rm(spp_list); rm(spp_df) #clean up
}
##for undefined families, go up to order to find species
for (i in 1:dim(species_by_MetazoanFamily)[1]){
if (species_by_MetazoanFamily$spp_list[i]=="NA") {
id <- strsplit(species_by_MetazoanFamily$order[i], "_")[[1]][2] #get the iud number for the class instead
spp_list <- downstream(id, db="ncbi", downto="species")
spp_df <- as.data.frame(spp_list[1])
species_by_MetazoanFamily$spp_list[i] <- paste0(spp_df[,1], collapse=";") #update database
rm(id); rm(spp_list); rm(spp_df) #clean up
}
}
write.csv(species_by_MetazoanFamily, "MetazoaSpeciesByFamily_2023-09-19.csv", row.names = F)
```