-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path16S_DataCleaning.R
216 lines (164 loc) · 9.67 KB
/
16S_DataCleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#######Read in OTU data#######
library(phyloseq)
library(foreach)
library(doParallel)
library(data.table)
library(picante)
packageVersion("phyloseq")
#Read in biom file, with singletons removed
otufile16S <-import_biom("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/Niwot_King/Figures&Stats/kingdata/Bact/16S_ALL_97_OTU_filtsing.biom")
tax_table(otufile16S)
head(tax_table(otufile16S))
#Read in a file that lists taxonomic levels in the silva database, https://www.arb-silva.de/fileadmin/silva_databases/release_119/Exports/taxonomy/tax_slv_ssu_nr_119.txt
silvamap<-read.delim("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/Niwot_King/Figures&Stats/elitalus/tax_slv_ssu_nr_119.txt",header=F)
head(silvamap)
#silvamap has Bacteria;Actinobacteria; listed as a class, which is incorrect, it should be a phylum. changing this:
silvamap$V3[which(silvamap$V1=="Bacteria;Actinobacteria;")]<-"phylum"
#it also has "uncultured" listed as a class incorrectly many times, I should put is as a genus (this is only a problem for bacteria not eukarya). but I have to fix this later on
#Assign taxonomic levels to the tax file.
#First get tax_table in the write format with ; instead of _ between tax levels
tax_table16S2<-as.matrix(tax_table(otufile16S))
#tax_table2<-tax_table2[1:5] #trial on first five rows
dim(tax_table16S2)
tax_table16S3<-as.factor(tax_table16S2[,1])
levels(tax_table16S3)
tax_table16S4<-as.factor(gsub("_",";",tax_table16S3))
tax_table16S5<-as.factor(paste(tax_table16S4,rep(";",length(tax_table16S4)),sep=""))
#Then make a table with the names to search against the silva map file
head(silvamap)
head(tax_table16S5)
tax_table16S6<-matrix(0,nrow=length(tax_table16S5),ncol=11)#was 15 for euks
for (i in 1:length(tax_table16S5)){
for (j in 1:length(as.vector(gregexpr(";",tax_table16S5[i])[[1]]))){
second<-as.vector(gregexpr(";",tax_table16S5[i])[[1]])[j]
tax_table16S6[i,j]<-substr(tax_table16S5[i],1,second)
}
}
unique(tax_table16S6[,1]) #it only has a max of 11 columns
#the taxonomy is messed up, within one taxonmy line there are multiple items that match to "class" for example, so my original code will not work. Thus I will try to just pull out kingdom, order, and otu
#Then search against the silva map file# this takes a while, maybe 2 min
silvamapmatrix16S<-matrix(0,nrow=length(tax_table16S5),ncol=11)#was 15 for euks
for (i in 1:nrow(tax_table16S6)){
ind<-match(tax_table16S6[i,],silvamap$V1)
silvamapmatrix16S[i,]<-as.character(silvamap$V3[ind])
}
#only 5 instances of order being duplicated, and my code would actually treat them correctly
#below I fixed the class duplication (whcih was thousands)
#there are 5families that are wrong, and my code would do it incorrectly
ind<-which(rowSums(silvamapmatrix16S=="class",na.rm=T)>1)
silvamapmatrix16S[ind[2],]
tax_table16S6[ind[2],]
#Make file with just last name from the table6 file
tax_table16S7<-matrix(0,nrow=length(tax_table16S5),ncol=15)
for (i in 1:length(tax_table16S5)){
for (j in 1:length(as.vector(gregexpr(";",tax_table16S5[i])[[1]]))){
if (j==1){
first<-0
second<-as.vector(gregexpr(";",tax_table16S5[i])[[1]])[j]
} else {
first<-as.vector(gregexpr(";",tax_table16S5[i])[[1]])[j-1]
second<-as.vector(gregexpr(";",tax_table16S5[i])[[1]])[j]
}
tax_table16S7[i,j]<-substr(tax_table16S5[i],first+1,second-1)
}
}
head(tax_table16S7)
#fix the places where uncultured is a class, change it to genus
ind<-which(tax_table16S7[,1]=="Bacteria"&rowSums(tax_table16S7=="uncultured")==1)
ind
for(i in 1:length(ind)){
ind2<-which(tax_table16S7[ind[i],]=="uncultured")
silvamapmatrix16S[ind[i],ind2]<-"genus"
}
head(silvamapmatrix16S)
#Then order them correctly so that each level is put in the correct column
mytaxlevels<-c("domain","major_clade","superkingdom","kingdom","subkingdom","superphylum","phylum","subphylum","class","subclass","superorder","order","family","subfamily","genus")
mytaxmatrix16S<-matrix(0,nrow=length(tax_table16S5),ncol=15)
for(i in 1:nrow(tax_table16S7)){
ind<-match(silvamapmatrix16S[i,],mytaxlevels)
len<-length(as.vector(gregexpr(";",tax_table16S5[i])[[1]]))
ind2<-which(is.na(ind)==F)
ind3<-ind[ind2]
mytaxmatrix16S[i,ind3]<-tax_table16S7[i,][ind2]
}
colnames(mytaxmatrix16S)<-mytaxlevels
rownames(mytaxmatrix16S)<-rownames(tax_table16S2)
head(mytaxmatrix16S)
mytaxmatrix16S[which(mytaxmatrix16S=="0")]<-NA
#mytaxtable<-as.data.frame(mytaxmatrix)
#class(mytaxmatrix)<-class(tax_table2)
#head(mytaxmatrix)
#mytaxmatrix[10:50,]
#Replace the new tax matrix in the otu file
tax_table(otufile16S)<-mytaxmatrix16S
#Import mapping and tree file
map16S<-import_qiime_sample_data("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/Niwot_King/Figures&Stats/kingdata/MappingFiles/515BC_Niwot_20072015_All_MapFile.txt")
dat16S<-merge_phyloseq(otufile16S,map16S)
#XXXXXXXXX add tree file when fastree stops
#This has not yet been integrated into dat
treefile16S <- read_tree("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/Niwot_King/Figures&Stats/kingdata/Bact/16S_ALL_truncate_97_sinaaln_rep_set.tre")
#plot(treefile) #takes a long time
dat16Stree<-merge_phyloseq(otufile16S,map16S,treefile16S) #takes a long time, maybe 1 hr. I lose 62,874 taxa (about half of the taxa) when I merge the treefile, I assume this is because some samples could not be aligned
######Soil data from 2015######
dat16Ss<-subset_samples(dat16S, SampleType=="soil"&year=="2015")
dat16Sst<-subset_samples(dat16Stree, SampleType=="soil"&year=="2015")
#take euks out
dat16Ss2<-subset_taxa(dat16Ss,domain%in%c("Archaea","Bacteria")&family!="mitochondria"&class!="Chloroplast")
dat16Ss2t<-subset_taxa(dat16Sst,domain%in%c("Archaea","Bacteria")&family!="mitochondria"&class!="Chloroplast")
#for tree data, take out samples 34, 5, 126 because they had less than 30 total reads
dat16Ss2ta = prune_samples(names(which(sample_sums(dat16Ss2t) >= 200)), dat16Ss2t)
dat16Ss2tar<-rarefy_even_depth(dat16Ss2ta,sample.size=min(sample_sums(dat16Ss2ta)),rngseed=10,replace=F)#rarefy to 3026, takes a few minutes
pd16S<-pd(as.matrix(as.data.frame(t(otu_table(dat16Ss2tar)))),phy_tree(dat16Ss2tar),include.root=TRUE) #takes a few hours
#Renaming and organizing orders and kingdoms here, then replace the tax table in dats2 to include these new groupings
dat16Ss2tax<-as.data.frame(tax_table(dat16Ss2))
dim(dat16Ss2tax)
#all samples have a domain and a phylum and a class (although one level is "unknown class"). no kingdoms and there are NAs for order
#I'm going to skip the labels for order right now
sort(unique(dat16Ss2tax$phylum))
#photosynthetic phyla are Chloroflexi and Cyanobacteria
photbac<-ifelse(tax_table(dat16Ss2)[,"phylum"]%in%c("Chloroflexi","Cyanobacteria"),"PhotosyntheticBacteria","NonphotosyntheticBacteria")
bacterialabels<-data.frame(cbind(otu=rownames(tax_table(dat16Ss2)),orders=tax_table(dat16Ss2)[,"phylum"],kingdomlables=photbac));colnames(bacterialabels)<-c("otu","orders","kingdomlabels")
head(bacterialabels)
#take out samples 34, 5, 126 because they had less than 30 total reads
dat16Ss2a = prune_samples(names(which(sample_sums(dat16Ss2) >= 200)), dat16Ss2)
dat16Ss2a_s<-sample_data(dat16Ss2a)
dat16Ss2a_o<-t(otu_table(dat16Ss2a))
length(which(colSums(dat16Ss2a_o)>0))#remove zeros, count total number of OTUs
sum(dat16Ss2a_o)
#this file is too large to use cbind. online it was suggested to use data.table to store and manipulate large datasets, however it has a whole different syntax than data.frames
#dat16Ss2b<-cbind(dat16Ss2a_s,dat16Ss2a_ot)
#dat16Ss2a_st<-data.table(dat16Ss2a_s,keep.rownames = T)
#dat16Ss2otu<-cbind(sample_data(dat16Ss2a),t(otu_table(dat16Ss2a)))#takes too long
#dat16Ss2a, dat16Ss2a_s, dat16Ss2a_o is the final data file for diversity analyses. the only thing removed are plants/chloroplasts/mitochondria and otus with only one read (which is likely error). also removed are the three samples which didn't amplify
#Then remove more rare taxa which will create a datset for network analysis
#Beginning of code for order groups
#Remove doubletons and singletons, taxa present in only 1 or 2 samples
wh0 = genefilter_sample(dat16Ss2, filterfun_sample(function(x) x > 0), A = 3)
dat16Ss3 = prune_taxa(wh0, dat16Ss2)
#####Group things by genus / otu#####
#Third try, trying to be consistent with sample size minimum in the network analysis (my cutoff there is 3 or 4 or 5). Follwing Widder et al 2014 PNAS
wh0 = genefilter_sample(dat16Ss2, filterfun_sample(function(x) x > 0), A = 3)#30635 taxa
dat16Ss10a = prune_taxa(wh0, dat16Ss2)
dat16Ss10a
#transform to relative abundance
dat16Ss10b = transform_sample_counts(dat16Ss10a, function(x) x/sum(x) )
wh0 = filter_taxa(dat16Ss10b, function(x) sum(x) > (0.002), prune=F)
dat16Ss10c = prune_taxa(wh0, dat16Ss10a) #3019
dat16Ss10c
sort(sample_sums(dat16Ss10c)) #need to take out samples 34, 5, 126 because they had less than 30 total reads and need to rarefy the rest to 4896 reads
dat16Ss10 = prune_samples(names(which(sample_sums(dat16Ss10c) >= 200)), dat16Ss10c)
dat16Ss10otu<-cbind(sample_data(dat16Ss10),t(otu_table(dat16Ss10)))
head(dat16Ss10otu)[,1:40]
dim(dat16Ss10otu)
dat16Ss10r<-rarefy_even_depth(dat16Ss10,sample.size=min(sample_sums(dat16Ss10)),rngseed=10,replace=F)#rarefy dats10 for betadiversity metrics, 12 OTUS were removed because thye were no longer present in any sample after rarefaction
#nice, this is the same exact rarefaction as the comm.data
#head(comm.data)[,28:50]
#head(t(otu_table(dats10r)))[,1:30]
######Grouping by phylum#####
dat16Ss7<-transform_sample_counts(dat16Ss2, function(x) 100*x/sum(x) )
dat16Ss8<-otu_table(dat16Ss7)
dat16Ss2tax[,"phylum"]
dat16Ss9<-aggregate.data.frame(dat16Ss8,by=list(kingdomlabels=dat16Ss2tax[,"phylum"]),sum)
rownames(dat16Ss9)<-dat16Ss9$kingdomlabels
dat16Ss9$kingdomlabels<-NULL
dat16Ss9kingdom<-cbind(sample_data(dat16Ss2),t(dat16Ss9))