-
Notifications
You must be signed in to change notification settings - Fork 104
/
pipeline_qiime2 en.sh
317 lines (256 loc) · 15.8 KB
/
pipeline_qiime2 en.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
[TOC]
# QIIME2 2024.5 Analysis Process
## 0. Software Installation (Appendix 1)
# Only compatible with Linux/Mac systems.
# Windows users can use the built-in Windows Subsystem for Linux (supports right-click paste) or install and run everything in a Linux server environment.
# Refer to the official documentation for detailed instructions: https://docs.qiime2.org/2024.5/ or the QIIME2 tutorial collection: https://mp.weixin.qq.com/s/farGisfX3fVL_5WgXS8lIg
# Installing the Windows Subsystem for Linux: https://mp.weixin.qq.com/s/0PfA0bqdvrEbo62zPVq4kQ
## 1. Preparatory Work
# Set the working directory: for example ~/amplicon/qiime2 on a server, or the
# path below when running under the Windows Subsystem for Linux.
wd=/mnt/c/EasyAmplicon/qiime2/
# Create the working directory and enter it. The expansions are quoted so that
# a path containing spaces is not word-split, and a failure is reported on
# stderr. The shell is deliberately not exited, because these commands are
# typically pasted into an interactive terminal.
if ! { mkdir -p "${wd}" && cd "${wd}"; }; then
  echo "ERROR: cannot create or enter working directory: ${wd}" >&2
fi
# Activate the QIIME2 working environment. For older versions of conda, use
# 'source activate' instead of 'conda activate'.
conda activate qiime2-amplicon-2024.5
# Prepare the sample metadata (metadata.txt) and the raw reads (seq/*.fq.gz).
## The commented-out commands below download the sequencing data listed in
## metadata.txt from a public source, using GSA's CRA (batch) and CRR (sample)
## identifiers (fields 5 and 6 of metadata.txt are used to build the URL).
mkdir -p seq
# # Download from the public network
# cd seq
# awk '{system("wget -c ftp://download.big.ac.cn/gsa/"$5"/"$6"/"$6"_f1.fq.gz -O "$1"_1.fq.gz")}' \
# <(tail -n+2 ../metadata.txt)
# awk '{system("wget -c ftp://download.big.ac.cn/gsa/"$5"/"$6"/"$6"_r2.fq.gz -O "$1"_2.fq.gz")}' \
# <(tail -n+2 ../metadata.txt)
# cd .. && ls -lsh seq
# Hard-link the inputs from another location (takes no additional disk space)
ln /mnt/c/EasyAmplicon/seq/* seq/
ln /mnt/c/EasyAmplicon/result/metadata.txt .
# Build the manifest from the metadata: the first metadata line is replaced by
# the manifest header, every following line yields one sample row keyed by its
# first field. The text "$PWD" sits inside the single-quoted awk program, so it
# is written to the manifest literally.
# NOTE(review): presumably QIIME 2 expands $PWD when reading the manifest -
# confirm against the QIIME 2 importing documentation.
awk -v OFS='\t' '
  NR == 1 { print "sample-id", "forward-absolute-filepath", "reverse-absolute-filepath"; next }
  { print $1, "$PWD/seq/" $1 "_1.fq.gz", "$PWD/seq/" $1 "_2.fq.gz" }
' metadata.txt > manifest
# Preview the header and the first two samples
head -n 3 manifest
# Import the paired-end reads listed in the manifest into a QIIME2 artifact
# (Phred33 manifest format, version 2).
qiime tools import \
  --input-path manifest \
  --input-format PairedEndFastqManifestPhred33V2 \
  --type "SampleData[PairedEndSequencesWithQuality]" \
  --output-path demux.qza
# Timing notes: about 7m for 1G of uncompressed fq, only 34s for the compressed fq.gz format, and 9s for the test data
## 2. Generate feature tables and representative sequences
### Method 1: DADA2 (slow, and error-prone because it depends on many R and Python packages)
# Supports multi-threaded acceleration. Timings for 900,000 PE250 reads: 0/96p, 34m; 24p, 44m; 8p, 77m; 1p, 462m
# For 270,000 reads: 8p, 9m; 4p, 11m
time qiime dada2 denoise-paired \
  --i-demultiplexed-seqs demux.qza \
  --p-trim-left-f 29 \
  --p-trim-left-r 18 \
  --p-trunc-len-f 0 \
  --p-trunc-len-r 0 \
  --p-n-threads 4 \
  --o-table dada2-table.qza \
  --o-representative-sequences dada2-rep-seqs.qza \
  --o-denoising-stats denoising-stats.qza
# Adopt the DADA2 results as the inputs of the main workflow
cp dada2-rep-seqs.qza rep-seqs.qza
cp dada2-table.qza table.qza
### Method 2: External import of the feature table and representative sequences (commonly used)
# Upload the previously generated OTU table (otutab.txt) and representative sequences (otus.fa)
# Convert the text table to BIOM 1.0 (JSON) format. Note: biom --version 2.1.5 and 2.1.8 work, but 2.1.7 causes errors
biom convert \
  -i otutab.txt \
  -o otutab.biom \
  --to-json \
  --table-type='OTU table'
# Import the feature table (took 9 seconds)
qiime tools import \
  --type "FeatureTable[Frequency]" \
  --input-format BIOMV100Format \
  --input-path otutab.biom \
  --output-path table.qza
# Import the representative sequences (took 8 seconds)
qiime tools import \
  --type "FeatureData[Sequence]" \
  --input-path otus.fa \
  --output-path rep-seqs.qza
### Summary statistics for the feature table and the representative sequences
qiime feature-table summarize \
  --m-sample-metadata-file metadata.txt \
  --i-table table.qza \
  --o-visualization table.qzv
qiime feature-table tabulate-seqs --i-data rep-seqs.qza --o-visualization rep-seqs.qzv
# Download the QZV files for online viewing. DADA2 resulted in just over 1,000 ASVs
## 3. Alpha and beta diversity analysis
### Build a phylogenetic tree for the diversity analysis (took 53s)
qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences rep-seqs.qza \
  --o-rooted-tree rooted-tree.qza \
  --o-tree unrooted-tree.qza \
  --o-alignment aligned-rep-seqs.qza \
  --o-masked-alignment masked-aligned-rep-seqs.qza
### Calculate the core diversity metrics
# Took 13s. The sampling depth is typically set to the minimum per-sample count reported in table.qzv.
qiime diversity core-metrics-phylogenetic \
  --i-table table.qza \
  --i-phylogeny rooted-tree.qza \
  --m-metadata-file metadata.txt \
  --p-sampling-depth 7439 \
  --output-dir core-metrics-results
### Between-group significance test and visualization for alpha diversity
# Took 7s. Available alpha indices: faith_pd, shannon, observed_features, evenness
index=observed_features
qiime diversity alpha-group-significance \
  --m-metadata-file metadata.txt \
  --i-alpha-diversity "core-metrics-results/${index}_vector.qza" \
  --o-visualization "core-metrics-results/${index}-group-significance.qzv"
### Alpha diversity rarefaction curves
# Took 25s. Set max-depth to the maximum per-sample count reported in table.qzv
qiime diversity alpha-rarefaction \
  --i-phylogeny rooted-tree.qza \
  --i-table table.qza \
  --m-metadata-file metadata.txt \
  --p-max-depth 10298 \
  --o-visualization alpha-rarefaction.qzv
# The indices available in the result are observed_features (named observed_otus in older QIIME 2 releases), shannon, and faith_pd
### Between-group significance test and visualization for beta diversity
# Available beta diversity indices: unweighted_unifrac, bray_curtis, weighted_unifrac and jaccard
# Took 7s. Specifying the grouping column reduces the computational load; permutation tests are more time-consuming
column=Group
distance=weighted_unifrac
qiime diversity beta-group-significance \
  --m-metadata-file metadata.txt \
  --m-metadata-column "${column}" \
  --i-distance-matrix "core-metrics-results/${distance}_distance_matrix.qza" \
  --p-pairwise \
  --o-visualization "core-metrics-results/${distance}-${column}-significance.qzv"
## 4. Species composition analysis
# Annotate species with one of the classifiers listed in the appendix, e.g. silva-138-99-nb-classifier.qza or 2022.10.backbone.full-length.nb.qza
# Took 1m. Primer-specific training sets are optional; for example classifier_gg_13_8_99_V5-V7.qza is trained on the V5-V7 region. See the appendix or the official tutorial for details
time qiime feature-classifier classify-sklearn \
  --i-reads rep-seqs.qza \
  --i-classifier 2022.10.backbone.full-length.nb.qza \
  --o-classification taxonomy.qza
# Tabulate the species annotations for viewing
qiime metadata tabulate --m-input-file taxonomy.qza --o-visualization taxonomy.qzv
# Show the composition as stacked bar charts
qiime taxa barplot \
  --i-taxonomy taxonomy.qza \
  --i-table table.qza \
  --m-metadata-file metadata.txt \
  --o-visualization taxa-bar-plots.qzv
## 5. Differential analysis using ancom
# Format the feature table and add pseudo-counts (4s)
qiime composition add-pseudocount --i-table table.qza --o-composition-table comp-table.qza
# Find differential features between the groups of the chosen metadata column (1m)
column=Group
time qiime composition ancom \
  --m-metadata-file metadata.txt \
  --m-metadata-column "${column}" \
  --i-table comp-table.qza \
  --o-visualization "ancom-${column}.qzv"
# Repeat the test after merging the features by genus
## Collapse the table at taxonomic level 6, i.e. genus (6s)
qiime taxa collapse \
  --i-taxonomy taxonomy.qza \
  --i-table table.qza \
  --p-level 6 \
  --o-collapsed-table table-l6.qza
# Format the collapsed table and add pseudo-counts (6s)
qiime composition add-pseudocount --i-table table-l6.qza --o-composition-table comp-table-l6.qza
# Find differential genera, reusing the grouping column set above (16s)
qiime composition ancom \
  --m-metadata-file metadata.txt \
  --m-metadata-column "${column}" \
  --i-table comp-table-l6.qza \
  --o-visualization "ancom-l6-${column}.qzv"
# Appendix
## 1. To install qiime2-amplicon-2024.5
### Install Conda
# Download the Miniconda installer (resuming a partial download if present)
wget --continue https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# Run the installer with its -b and -f flags, as in the upstream instructions
bash Miniconda3-latest-Linux-x86_64.sh -b -f
# Initialize conda for the shell
~/miniconda3/condabin/conda init
# Close the terminal and reopen it so that the initialization takes effect.
## Online installation of QIIME using Conda.
# After installing Miniconda and opening a new terminal, make sure you are running the latest version of conda (optional but recommended):
#conda update conda
# Environment name, which also encodes the QIIME2 distribution and version
n=qiime2-amplicon-2024.5
# The commands below install the software online and optionally package the environment
# Download the environment file
wget --continue "https://data.qiime2.org/distro/amplicon/${n}-py39-linux-conda.yml"
# Alternative Link: wget -c http://www.imeta.science/db/conda/${n}-py39-linux-conda.yml
# Create the Conda environment
# Install into a new environment; once it installs successfully it can be packaged and distributed to other servers.
conda env create -n "${n}" --file "${n}-py39-linux-conda.yml"
# Package the environment (optional, 1.2G)
conda pack -n "${n}" -o "${n}.tar.gz"
## 2. Species annotation training sets (pre-trained classifiers)
### Silva 138 99% OTUs full-length sequences
# NOTE(review): this artifact comes from the 2023.2 release path while the
# environment used in this file is qiime2-amplicon-2024.5; a scikit-learn
# version mismatch may occur when it is used - confirm, or retrain the classifier.
# Download from the official website
wget --continue https://data.qiime2.org/2023.2/common/silva-138-99-nb-classifier.qza
# Alternative link
wget --continue ftp://download.nmdc.cn/tools/amplicon/silva/silva-138-99-nb-classifier.qza
### Greengenes2 2022.10 full-length sequences
# Download from the official website
wget --continue http://ftp.microbio.me/greengenes_release/2022.10/2022.10.backbone.full-length.nb.qza
# Alternative link
wget --continue ftp://download.nmdc.cn/tools/amplicon/GreenGenes/2022.10.backbone.full-length.nb.qza
## 3. Species Annotation data training set
# Working directory for building the reference database and classifiers.
wd=/mnt/c/EasyAmplicon/qiime2
# Create the directory and enter it. The expansions are quoted so that a path
# containing spaces is not word-split, and a failure is reported on stderr.
# The shell is deliberately not exited, because these commands are typically
# pasted into an interactive terminal.
if ! { mkdir -p "${wd}" && cd "${wd}"; }; then
  echo "ERROR: cannot create or enter working directory: ${wd}" >&2
fi
# Download the database file (Greengenes, 320M)
# wget -c ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz
# Domestic alternative link
# wget -c ftp://download.nmdc.cn/tools/amplicon/GreenGenes/gg_13_8_otus.tar.gz
# Domestic backup containing only the core 99% database (60MB)
wget --continue ftp://download.nmdc.cn/tools/amplicon/GreenGenes/gg_13_8_otus_99.tar.gz
# Rename it to the archive name of the full database
mv gg_13_8_otus_99.tar.gz gg_13_8_otus.tar.gz
# Unpack (gzip-compressed tar, verbose)
tar -xzvf gg_13_8_otus.tar.gz
# The reference for species annotation is rep_set/99_otus.fasta (sequences) plus taxonomy/99_otu_taxonomy.txt (taxonomy) from the unpacked archive
# Import the reference sequences (50s)
qiime tools import \
  --input-path gg_13_8_otus/rep_set/99_otus.fasta \
  --type "FeatureData[Sequence]" \
  --output-path 99_otus.qza
# A "Fontconfig error: Cannot load default config file" message does not affect the results
# Import the species taxonomy information (took 6s)
qiime tools import \
  --input-path gg_13_8_otus/taxonomy/99_otu_taxonomy.txt \
  --input-format HeaderlessTSVTaxonomyFormat \
  --type "FeatureData[Taxonomy]" \
  --output-path ref-taxonomy.qza
# Train the Naive Bayes classifier on the full-length reference sequences (took 30m)
time qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-taxonomy ref-taxonomy.qza \
  --i-reference-reads 99_otus.qza \
  --o-classifier classifier_gg_13_8_99.qza
# Extract the amplified region from the reference sequences using the primers (extract reference reads)
# A common choice is the Greengenes 13_8 99% OTUs trimmed to the 341F CCTACGGGNGGCWGCAG / 805R GACTACHVGGGTATCTAATCC region (classifier description): supply the sequencing primer sequences so that the matching region is extracted for comparison and classification.
# This run used primers 799F-1193R; replace them according to your own experiment (8m)
time qiime feature-classifier extract-reads \
  --p-f-primer AACMGGATTAGATACCCKG \
  --p-r-primer ACGTCATCCCCACCTTCC \
  --i-sequences 99_otus.qza \
  --o-reads ref-seqs.qza
# Train the region-specific classifier
# Uses the extracted segments to produce an experiment-specific classifier (took 7 min)
time qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-taxonomy ref-taxonomy.qza \
  --i-reference-reads ref-seqs.qza \
  --o-classifier classifier_gg_13_8_99_V5-V7.qza
# Common issue 1: the scikit-learn version is incompatible. Retraining the classifier with fit-classifier-naive-bayes in the current environment resolves the issue. Example error:
# Plugin error from feature-classifier:
# The scikit-learn version (0.21.2) used to generate this artifact does not match the current version of scikit-learn installed (0.22.1). Please retrain your classifier for your current deployment to prevent data-corruption errors.
# Debug info has been saved to /tmp/qiime2-q2cli-err-5ngzk2hm.log
# Common issue 2: DADA2 fails because the environment configuration is incomplete. Run 'conda unpack' to initialize the environment. Example error:
# Plugin error from dada2:
# An error was encountered while running DADA2 in R (return code 255), please inspect stdout and stderr to learn more.
# Debug info has been saved to /tmp/qiime2-q2cli-err-utwt1cmu.log
## Citation
# If you use this script, please cite:
Evan Bolyen, Jai Ram Rideout, Matthew R. Dillon, Nicholas A. Bokulich, Christian C. Abnet, Gabriel A. Al-Ghalith, Harriet Alexander, Eric J. Alm, Manimozhiyan Arumugam, Francesco Asnicar, Yang Bai, Jordan E. Bisanz, Kyle Bittinger, Asker Brejnrod, Colin J. Brislawn, C. Titus Brown, Benjamin J. Callahan, Andrés Mauricio Caraballo-Rodríguez, John Chase, Emily K. Cope, Ricardo Da Silva, Christian Diener, Pieter C. Dorrestein, Gavin M. Douglas, Daniel M. Durall, Claire Duvallet, Christian F. Edwardson, Madeleine Ernst, Mehrbod Estaki, Jennifer Fouquier, Julia M. Gauglitz, Sean M. Gibbons, Deanna L. Gibson, Antonio Gonzalez, Kestrel Gorlick, Jiarong Guo, Benjamin Hillmann, Susan Holmes, Hannes Holste, Curtis Huttenhower, Gavin A. Huttley, Stefan Janssen, Alan K. Jarmusch, Lingjing Jiang, Benjamin D. Kaehler, Kyo Bin Kang, Christopher R. Keefe, Paul Keim, Scott T. Kelley, Dan Knights, Irina Koester, Tomasz Kosciolek, Jorden Kreps, Morgan G. I. Langille, Joslynn Lee, Ruth Ley, **Yong-Xin Liu**, Erikka Loftfield, Catherine Lozupone, Massoud Maher, Clarisse Marotz, Bryan D. Martin, Daniel McDonald, Lauren J. McIver, Alexey V. Melnik, Jessica L. Metcalf, Sydney C. Morgan, Jamie T. Morton, Ahmad Turan Naimey, Jose A. Navas-Molina, Louis Felix Nothias, Stephanie B. Orchanian, Talima Pearson, Samuel L. Peoples, Daniel Petras, Mary Lai Preuss, Elmar Pruesse, Lasse Buur Rasmussen, Adam Rivers, Michael S. Robeson, Patrick Rosenthal, Nicola Segata, Michael Shaffer, Arron Shiffer, Rashmi Sinha, Se Jin Song, John R. Spear, Austin D. Swafford, Luke R. Thompson, Pedro J. Torres, Pauline Trinh, Anupriya Tripathi, Peter J. Turnbaugh, Sabah Ul-Hasan, Justin J. J. van der Hooft, Fernando Vargas, Yoshiki Vázquez-Baeza, Emily Vogtmann, Max von Hippel, William Walters, Yunhu Wan, Mingxun Wang, Jonathan Warren, Kyle C. Weber, Charles H. D. Williamson, Amy D. Willis, Zhenjiang Zech Xu, Jesse R. Zaneveld, Yilong Zhang, Qiyun Zhu, Rob Knight, J. Gregory Caporaso. 2019. 
Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. **Nature Biotechnology** 37: 852-857. https://doi.org/10.1038/s41587-019-0209-9
# Copyright 2016-2023 Yong-Xin Liu <[email protected]> (tested and compiled)