---
title: "16s_dada2"
author: "SHAIL"
output: html_document
---
# 1.0 Load packages
```{r, results='hide', message=FALSE}
library("dada2")
library("Biostrings")
library("ShortRead") # readFastq() and sread() are used for the primer checks below
library("ggplot2")
library("kableExtra")
library("tidyverse")
sessionInfo()
# Set seed for reproducibility
set.seed(0168)
```
## 2.0 Initial setup
# Set up directories and fastq files, and identify trimming parameters.
# 2.1 Assign directories
```{r}
fastq_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/fastq/" # fastq directory
filtN_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/filtN/" #Filter Ns
cutadapt_trimmed_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/cutadapt_trimmed/"
filtered_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/fastq_filtered/" # fastq filtered
qual_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/qual_pdf/" # qual pdf
dada2_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/dada2/" # dada2 results
database_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/databases/" # databases
blast_dir <- "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/blast/" #blast
```
## 2.1.1 Create directories (skip if already created)
```{r}
dir.create(fastq_dir)
dir.create(filtN_dir)
dir.create(cutadapt_trimmed_dir)
dir.create(filtered_dir)
dir.create(qual_dir)
dir.create(dada2_dir)
dir.create(database_dir)
dir.create(blast_dir)
```
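# If the directories already exist, dir.create() only issues warnings on re-runs. A minimal alternative sketch that creates any missing directories quietly (assuming the directory variables from 2.1 are defined):
```{r}
# dir.create() is not vectorised, so loop over the directories;
# showWarnings = FALSE silences the "already exists" warning on re-runs
dirs <- c(fastq_dir, filtN_dir, cutadapt_trimmed_dir, filtered_dir,
          qual_dir, dada2_dir, database_dir, blast_dir)
for (d in dirs) dir.create(d, showWarnings = FALSE, recursive = TRUE)
```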
# (Manually place your fastq files in the fastq directory. Check the names and extensions, and adjust the patterns in the next step accordingly.)
## 2.2 Examine the FASTQ files
# It is assumed that sample names are at the start of the file name and separated by "_", e.g. xxxx_R1.fastq.gz.
# Get a list of all fastq files and separate R1 and R2:
```{r}
fns <- sort(list.files(fastq_dir, full.names = TRUE))
fns <- fns[str_detect(basename(fns), "\\.fastq\\.gz$")]
fns_R1 <- fns[str_detect(basename(fns), "R1")]
fns_R2 <- fns[str_detect(basename(fns), "R2")]
```
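# Optional sanity check before extracting sample names: the R1 and R2 files should pair up one-to-one.
```{r}
# The two lists must be the same length and sort into matching order
stopifnot(length(fns_R1) == length(fns_R2))
head(basename(fns_R1))
head(basename(fns_R2))
```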
# 2.2.1 Extract sample names, assuming filenames have the format SAMPLENAME_XXX.fastq.gz
```{r}
sample.names <- str_split(basename(fns_R1), pattern = "_", simplify = TRUE)
sample.names <- sample.names[,1]
```
# 2.2.2 Compute number of paired reads
# create an empty data frame
```{r}
df <- data.frame()
```
# loop through all the R1 files (no need to go through R2 which should be
# the same)
```{r}
for (i in 1:length(fns_R1)) {
# use the dada2 function fastq.geometry
geom <- fastq.geometry(fns_R1[i])
# extract the information on number of sequences and file name
df_one_row <- data.frame(n_seq = geom[1], file_name = basename(fns_R1[i]))
# add one line to data frame
df <- bind_rows(df, df_one_row)
}
# display the number of sequences per file
kable(df) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
```
# 2.3 Identify primers (change primer sequences according to your sequencing)
```{r}
FWD <- "TGCGAYCCSAARGCBGACTC" ## POLF
REV <- "ATSGCCATCATYTCRCCGGA" ## POLR
```
# To ensure we have the right primers, and the correct orientation of the primers on the reads,
# we will verify the presence and orientation of these primers in the data.
```{r}
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # Biostrings works with DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
```
## 3.0 Primer trimming
#The presence of ambiguous bases (Ns) in the sequencing reads makes accurate mapping of short primer sequences difficult.
#Next we "pre-filter" the sequences just to remove those with Ns, but perform no other filtering.
#Use multithread = FALSE on Windows.
```{r}
fns_R1.filtN <- str_c(filtN_dir, sample.names, ".R1_filtN.fq.gz") # Put N-filtered files in filtN/ subdirectory
fns_R2.filtN <- str_c(filtN_dir, sample.names, ".R2_filtN.fq.gz")
```
```{r}
out <- filterAndTrim(fns_R1, fns_R1.filtN, fns_R2, fns_R2.filtN, maxN = 0, multithread = TRUE)
```
# Count the number of times the primers appear in the forward and reverse reads, considering all possible primer orientations.
# Identifying and counting the primers in one set of paired-end fastq files is sufficient,
# assuming all the files were created with the same library preparation, so we process only the first sample.
```{r}
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fns_R1.filtN[[1]]),
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fns_R2.filtN[[1]]),
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fns_R1.filtN[[1]]),
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fns_R2.filtN[[1]]))
```
# The output shows whether any of the primers are present in the forward and reverse reads. If so, trim the primers either with cutadapt or with DADA2's built-in trimmer (cutadapt is preferred, as it trims all primer sequences accurately).
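# For reference, a minimal sketch of the DADA2 built-in alternative: trimLeft removes a fixed number of bases from the 5' end of each read. This only works when every read starts exactly with the primer (no length variation or read-through); the output file names here are hypothetical.
```{r}
# Hypothetical output files; trimLeft = c(FWD length, REV length)
# trims the forward and reverse reads respectively
trim_R1 <- str_c(cutadapt_trimmed_dir, sample.names, "_R1_trim.fq.gz")
trim_R2 <- str_c(cutadapt_trimmed_dir, sample.names, "_R2_trim.fq.gz")
out_trim <- filterAndTrim(fns_R1.filtN, trim_R1, fns_R2.filtN, trim_R2,
                          trimLeft = c(nchar(FWD), nchar(REV)),
                          multithread = TRUE)
```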
## 3.1 Primer trimming using Cutadapt
#Install cutadapt if you don't have it already. After installing cutadapt, we need to tell R the path to the cutadapt command.
```{r}
cutadapt <- "/usr/local/bin/cutadapt" # CHANGE ME to the cutadapt path on your machine
system2(cutadapt, args = "--version") # Run shell commands from R
```
#If the above command executed successfully and printed the cutadapt version, R has found cutadapt and you are ready for trimming.
#We now create output filenames for the cutadapt-ed files and define the parameters we are going to give the cutadapt command.
#The critical parameters are the primers, and they need to be in the right orientation: the FWD primer should match the forward reads in its forward orientation, and the REV primer should match the reverse reads in its forward orientation.
```{r}
path.cut <- cutadapt_trimmed_dir
fns_R1.cut <- file.path(path.cut, basename(fns_R1))
fns_R2.cut <- file.path(path.cut, basename(fns_R2))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
# Run Cutadapt
for(i in seq_along(fns_R1)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fns_R1.cut[i], "-p", fns_R2.cut[i], # output files
fns_R1.filtN[i], fns_R2.filtN[i])) # input files
}
```
#Check again whether the primers have been trimmed off:
```{r}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fns_R1.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fns_R2.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fns_R1.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fns_R2.cut[[1]]))
```
#The primer-free sequence files are now ready to be analyzed through the DADA2 pipeline. As with the earlier steps of reading in fastq files, we read in the names of the cutadapt-ed fastq files and apply some string manipulation to get matched lists of forward and reverse fastq files.
# Read in the cutadapt-ed forward and reverse fastq filenames:
```{r}
fns_cut <- sort(list.files(cutadapt_trimmed_dir, full.names = TRUE))
fns_cut <- fns_cut[str_detect(basename(fns_cut), "fq")]
cut_R1 <- fns_cut[str_detect(basename(fns_cut), "R1")]
cut_R2 <- fns_cut[str_detect(basename(fns_cut), "R2")]
```
# Extract sample names, assuming filenames have the format SAMPLENAME_XXX.fastq.gz
```{r}
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cut_R1, get.sample.name))
head(sample.names)
```
# 3.2 Plot quality profiles for the reads
```{r}
for (i in 1:length(fns_cut)) {
# Use dada2 function to plot quality
p1 <- plotQualityProfile(fns_cut[i])
# Only plot on screen for first 2 files
if (i <= 2) {
print(p1)
}
# save the plot as a pdf file
p1_file <- paste0(qual_dir, basename(fns_cut[i]), ".qual.pdf")
ggsave(plot = p1, filename = p1_file, device = "pdf", width = 15, height = 15,
scale = 1, units = "cm")
}
```
# The quality profile plot is a gray-scale heatmap of the frequency of each quality score at each base position.
# The median quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The x-axis shows read length and the y-axis shows quality score. Note the position at which the quality deteriorates (generally, below a Q score of 30); this will be the truncation length in the next step. Check both the R1 and R2 files.
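# If per-file plots are too many to inspect, plotQualityProfile() can also aggregate all files of one direction into a single summary plot (a quick sketch):
```{r}
# One aggregate quality profile per read direction
plotQualityProfile(cut_R1, aggregate = TRUE)
plotQualityProfile(cut_R2, aggregate = TRUE)
```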
## 3.3 Filter and trim the reads
# 3.3.1 Place filtered files in the filtered/ subdirectory
```{r}
filt_R1 <- file.path(filtered_dir, paste0(sample.names, "_R1_filt.fq.gz"))
filt_R2 <- file.path(filtered_dir, paste0(sample.names, "_R2_filt.fq.gz"))
names(filt_R1) <- sample.names
names(filt_R2) <- sample.names
```
#Set truncLen=c() according to the quality profiles in the graphs above. trimLeft=c() is only for primer removal; change the value according to your primer length (nt), or drop the argument if the primers have already been removed from the fastq files. Other parameters can be kept at their defaults (unless you get abnormal results).
#Be careful when setting the truncation lengths; consider your sequenced amplicon length. The summed truncation lengths of R1 and R2 should be exactly, or a bit more than, the expected amplicon length, so the reads still overlap.
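# A worked example with assumed numbers: if the primer-trimmed nifH amplicon is roughly 320 bp, the truncation lengths used in the next chunk leave a comfortable overlap for merging.
```{r}
amplicon_len <- 320                # assumed post-trimming amplicon length (check your own!)
trunc_R1 <- 210; trunc_R2 <- 190   # truncation lengths used in the next chunk
trunc_R1 + trunc_R2 - amplicon_len # expected overlap: 80 bp, well above the 12 bp minimum
```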
```{r}
out <- filterAndTrim(cut_R1, filt_R1, cut_R2, filt_R2,truncLen=c(210,190),
maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
compress=TRUE,multithread=TRUE)
head(out)
```
#The output shows the number of reads in and reads out. Check the reads out (filtered reads): if the number is very low (less than 50% retained), you need to revisit the filter and trim parameters. For a normal run a loss of 20-40% is acceptable. Ultimately it depends on the quality of the sequenced files and the filter and trim parameters applied.
## 4.0 DADA2 processing
# 4.1 Learn error rates
```{r}
err_R1 <- learnErrors(filt_R1, multithread = TRUE)
plotErrors(err_R1, nominalQ = TRUE)
```
```{r}
err_R2 <- learnErrors(filt_R2, multithread = TRUE)
plotErrors(err_R2, nominalQ = TRUE)
```
# Check the plots. Red line = expected error rate according to the nominal definition of the Q-score; black line = error rate estimated by the machine-learning algorithm; black dots = observed error rates. Check whether the black line reasonably fits the observations (black points): it will not be a perfect fit, a good fit is what we look for. Second, check that the error rate decreases as the quality score increases. If both hold, we are ready to move forward.
# If you get the warning "Transformation introduced infinite values in continuous y-axis", this isn't an error, just a message from the plotting function letting you know there were some zero values in the plotted data (which turn into infinities on the log scale). That is completely expected.
# 4.2 Sample inference
# The core algorithm that identifies the real variants. Pooling shares sample information (ASVs) across all samples so as to get finer results; it works best for samples with high microbial density and many rare ASVs. The options are pool = FALSE, pool = TRUE, and pool = "pseudo". Try pool = TRUE first: you may get better results, but it takes more time and RAM. If you get a memory error, try pool = "pseudo" (see the sketch after the dada calls below); if that still fails, either use a higher-performance computer or use pool = FALSE.
# Now, using our trimmed sequences and the machine-learnt error rates, we will find sequence variants.
# First with the forward (R1) sequences.
```{r}
dada_R1 <- dada(filt_R1, err = err_R1, multithread = TRUE, pool = TRUE)
```
# Then, with reverse (R2) sequences.
```{r}
dada_R2 <- dada(filt_R2, err = err_R2, multithread = TRUE, pool = TRUE)
```
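# If pool = TRUE exhausts memory, pseudo-pooling is the fallback described above; a sketch (commented out so it does not overwrite the pooled results):
```{r}
# dada_R1 <- dada(filt_R1, err = err_R1, multithread = TRUE, pool = "pseudo")
# dada_R2 <- dada(filt_R2, err = err_R2, multithread = TRUE, pool = "pseudo")
```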
# Inspect the returned dada-class objects (denoising results):
```{r}
dada_R1[[1]]
```
```{r}
dada_R2[[1]]
```
# 4.3 Merge paired reads
# We now merge the forward and reverse reads together to obtain the full denoised sequences.
# By default, merged sequences are only output if the forward and reverse reads overlap by at least 12 bases, and are identical to each other in the overlap region.
```{r}
mergers <- mergePairs(dada_R1, filt_R1, dada_R2, filt_R2, verbose=TRUE)
```
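# If a large fraction of reads fails to merge despite sensible truncation lengths, the overlap requirements can be relaxed; a sketch of the relevant knobs (defaults are minOverlap = 12, maxMismatch = 0):
```{r}
# mergers <- mergePairs(dada_R1, filt_R1, dada_R2, filt_R2,
#                       minOverlap = 10, maxMismatch = 1, verbose = TRUE)
```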
# 4.4 Build the sequence table (ASVs)
# Construct an amplicon sequence variant (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.
```{r}
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
```
# OPTIONAL: sequences that are much longer or shorter than expected may be the result of non-specific priming. You can remove non-target-length sequences from the sequence table as follows.
#(240, 380) is the range of the targeted amplicon length. Change according to your amplicon length.
```{r}
seqtab <- seqtab[, nchar(colnames(seqtab)) %in% seq(240, 380)]
```
# 4.5 Remove chimeras
# Note that chimera removal will produce spurious results if primers have not been removed. The method parameter can be "pooled" (if samples were pooled in the sample-inference step) or "consensus". "consensus" does de novo identification in each sample, takes a vote across samples, and removes all ASVs identified as chimeric in a high enough fraction of the samples in which they were present. "pooled" lumps all ASVs in the data into one big sample, then identifies and removes chimeras. "consensus" performs better for typical datasets/workflows.
#Note: first run removeBimeraDenovo() with minFoldParentOverAbundance = 1 (the default) and check the percentage of non-chimeras. If a lot of reads are removed as chimeras and you are sure your reads are completely clean of primers, try increasing minFoldParentOverAbundance to 2, 4, 6, or 8 (see the comparison sketch after the chunk below). If you are still not satisfied with the results, you can always start again, making sure to use cleaned, quality reads.
```{r}
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE, minFoldParentOverAbundance = 2)
```
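# A quick way to act on the note above: compare read retention across several minFoldParentOverAbundance values before committing to one (a sketch; each run takes a while).
```{r}
for (mfpo in c(1, 2, 4, 8)) {
  st <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE,
                           minFoldParentOverAbundance = mfpo)
  # fraction of total reads surviving chimera removal at this setting
  cat("mfpo =", mfpo, ":", round(100 * sum(st) / sum(seqtab), 1), "% reads kept\n")
}
```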
#Inspect distribution of sequence lengths
```{r}
table(nchar(getSequences(seqtab.nochim)))
```
# The sequence table is a matrix with rows corresponding to (and named by) the samples, and columns corresponding to (and named by) the sequence variants. The lengths of the merged sequences should all fall within the expected range of the source amplicon.
# To check the percentage of non-chimeras
```{r}
paste0("% of non chimeras : ", sum(seqtab.nochim)/sum(seqtab) * 100)
```
# To check total number of sequences
```{r}
paste0("total number of sequences : ", sum(seqtab.nochim))
```
# 4.6 Track number of reads at each step
```{r}
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dada_R1, getN), sapply(dada_R2, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
```
#Get tabled output
```{r}
kable(track) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
```
# Apart from filtering (depending on how stringent you want to be) there should be no step in which a majority of reads are lost. If a majority of reads were removed as chimeric, you may need to revisit the removal of primers, as ambiguous nucleotides in unremoved primers interfere with chimera identification. If a majority of reads failed to merge, the culprit could also be unremoved primers, or biological length variation in the sequenced amplicon that extends beyond the total read length, resulting in no overlap.
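# A quick visual check of where reads are lost, using the track matrix built above (a sketch relying on the tidyverse and ggplot2 packages already loaded):
```{r}
track_long <- as.data.frame(track) %>%
  rownames_to_column("sample") %>%
  pivot_longer(-sample, names_to = "step", values_to = "reads") %>%
  mutate(step = factor(step, levels = colnames(track)))
ggplot(track_long, aes(step, reads, group = sample, colour = sample)) +
  geom_line() + geom_point() + theme_bw() +
  labs(x = "pipeline step", y = "number of reads")
```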
# 4.6.1 Save table to a file
```{r}
tf<- data.frame(track)
write.table(tf, "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/dada2/read_numbers_dada2.tsv", sep="\t", quote=F, col.names=NA)
```
#For taxonomy we will experiment to find the right balance, as the default method of assigning taxonomy to nifH genes does not work well. Here we first use the Moyn413 nifH database with minBoot = 0, then manually apply a minimum bootstrap of 70 at kingdom level, to strip off all sequences whose bootstrap value is less than 70, and of 50 at class level.
# 4.7 Assign taxonomy
```{r}
db_file <- paste0(database_dir, "nifH_dada2_all_v1.1.0.fasta")
taxanifh <- assignTaxonomy(seqtab.nochim, refFasta = db_file, minBoot=0, outputBootstraps = TRUE, multithread=TRUE)
```
#Rename sequences to ASV numbers and convert counts to a data frame
```{r}
seqtab_1 <- as.data.frame(t(seqtab.nochim)) %>% rownames_to_column(var = "sequence") %>%
rowid_to_column(var = "ASVNumber") %>% mutate(ASVNumber = sprintf("asv%04d",
ASVNumber)) %>% mutate(sequence = str_replace_all(sequence, "(-|\\.)", ""))
```
#Append taxonomy and boot to the sequence table
```{r}
taxa_tax <- as.data.frame(taxanifh$tax)
taxa_boot <- as.data.frame(taxanifh$boot) %>% rename_all(~ str_c(., "_boot"))
seqtab.nochim_nifh <- taxa_tax %>% bind_cols(taxa_boot) %>% bind_cols(seqtab_1)
```
#Let's apply the bootstrap minimum to kingdom and then to class
```{r}
bootstrap_mink <- 70
# Filter on the kingdom-level bootstrap
seqtab.nochim_boot70 <- seqtab.nochim_nifh %>% dplyr::filter(Kingdom_boot >= bootstrap_mink)
# Then filter on the class-level bootstrap
bootstrap_minc <- 50
seqtab.nochim_boot50 <- seqtab.nochim_boot70 %>% dplyr::filter(Class_boot >= bootstrap_minc)
```
# 4.8 Save results from DADA2
#count table
```{r}
seqdf50 <- seqtab.nochim_boot50
# Drop the taxonomy, bootstrap, and sequence columns, keeping ASVNumber and the per-sample counts
# (the column indices depend on your table layout; check names(seqdf50) first)
asvtab50 <- seqdf50[,-c(1,2,3,4,5,6,7,8,9,10,11,12,14)]
write.table(asvtab50, "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/dada2/nifdb/ASVs_table_nifdb50.tsv", sep="\t", quote=F, col.names=NA)
```
# tax table
```{r}
# Keep the taxonomy columns and ASVNumber, dropping the bootstrap, count, and sequence columns
# (again, check the indices against names(seqdf50))
taxtab50 <- seqdf50[,-c(7,8,9,10,11,12,14,15,16,17,18,19)]
write.table(taxtab50, "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/dada2/nifdb/tax_table_nifdb50.tsv", sep="\t", quote=F, col.names=NA)
```
#Save the whole table (without class-level bootstrap filtering) for manual filtering
```{r}
write.table(seqtab.nochim_boot70, "~/Documents/amplicon_seq_DADA2/transcript_nifH-amplicon/dada2/nifdb/nifdb_asv_taxa_kingdom_70boot.tsv", sep="\t", quote=F, col.names=NA)
```
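# Finally, a sketch for exporting the retained ASV sequences to FASTA (e.g. for BLAST, using the blast_dir created in 2.1). The file name is hypothetical; this assumes the "sequence" and "ASVNumber" columns from seqtab_1 are still present in seqtab.nochim_boot50.
```{r}
asv_seqs <- Biostrings::DNAStringSet(seqtab.nochim_boot50$sequence)
names(asv_seqs) <- seqtab.nochim_boot50$ASVNumber
Biostrings::writeXStringSet(asv_seqs, filepath = paste0(blast_dir, "ASVs_nifdb50.fasta"))
```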