-
Notifications
You must be signed in to change notification settings - Fork 3
/
config.yaml
189 lines (178 loc) · 14.8 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Sample categories to be analysed; each entry pairs a sample-name pattern
# with the reference used for reads in that category.
samples:
  # Regular expression of sample names to be analysed
  # (reads/original/.*-16S_R1.fastq.gz)
  - name: '.*-16S'
    # Reference genome for reads in the category
    # (reference/silva-16S/silva-16S.fa)
    reference: silva-16S
  # Another category of samples, with names that end with "-ITS"
  - name: '.*-ITS'
    # Reference genome for reads in the category (reference/unite/unite.fa)
    reference: unite
# Directory where generated reports and essential output files are stored
report_dir: report/example
# Number of threads to use in the analysis
threads: 16
# E-mail client setup (no e-mails are sent if this section is not specified)
email:
  # How the e-mails are sent
  setup:
    # Receiver address(es); explicitly null because none are configured here
    sendto: null
    # Gmail account used for sending (the e-mails will appear to come from
    # this address). If not provided, sending is attempted through the Linux
    # "sendmail" command.
    gmail:
      # Gmail address for sending e-mails.
      # NOTE(review): this value looks like a redaction placeholder rather
      # than a usable address - confirm against the real configuration.
      login_name: "[email protected]"
      # Gmail password for this address.
      # SECURITY(review): plaintext credential stored in a tracked config
      # file. Rotate this password and supply it from outside version control
      # once the consumer's secret-loading mechanism is confirmed.
      login_pass: "hesielko"
  # E-mails to send if the analysis succeeds
  onsuccess:
    send: true  # Send only if true
    list_files: false  # Include list of all generated files
    list_copied: false  # Include list of all copied files
  # E-mails to send if the analysis fails
  onerror:
    send: true  # Send only if true
    list_files: true  # Include list of files that should have been generated
# Reference processing: indexing, plus preparation and analysis of the
# reference sequences.
# All settings live under a single `reference` key: YAML mapping keys must be
# unique, and most loaders silently keep only the last duplicate, which would
# discard the `index` settings.
reference:
  # Index fasta file to quicker retrieve substring sequences
  index:
    # Create sequence dictionary index from fasta file to quicker retrieve
    # substring sequences
    dict:
      method: picard  # Supported values: picard
    # Index fasta file to quicker retrieve substring sequences
    fai:
      method: samtools  # Supported values: samtools
  # Multiple alignment of reference sequences
  alignment:
    method: mafft  # Supported values: mafft
  # Assess phylogenetic relationship between sequences
  phylogeny:
    method: iqtree  # Supported values: iqtree
    # Model to use for phylo tree generation - see iqtree documentation
    model: GTR+I+G4
  # Visually assess relationship between reference sequences
  report:
    # Visual inspection of distances between sequences in tree graph structure
    phylogenetic_tree:
      method: phylo  # Supported values: phylo
    # Interactive HTML visualization of multiple alignment
    comparison:
      method: msaviewer  # Supported values: msaviewer
# Prepare reads and quality reports for downstream analysis
reads:
  # Pre-processing of reads: eliminate sequencing artifacts, contamination ...
  preprocess:
    # Original reads without preprocessing
    original:
      # If true, files are removed after a successful analysis
      temporary: false
    # Remove low-quality parts of reads
    trimmed:
      method: trimmomatic  # Supported values: trimmomatic
      # If true, generated files are removed after a successful analysis
      temporary: false
      # Maximal number of bases in a read to keep; longer reads are truncated
      crop: 500
      # Minimal average quality of read bases to keep
      # (inside sliding window of length 5)
      quality: 20
      # Number of bases to remove from the start of a read
      headcrop: 20
      # Minimal length of a trimmed read; shorter reads are removed
      minlen: 35
      # Additional parameters sent to the trimming tool
      additional_param: ''
    # Eliminate fragments from known artificial source,
    # e.g. contamination by human
    decontaminated:
      method: bowtie2  # Supported values: bowtie2
      # If true, generated files are removed after a successful analysis
      temporary: false
      # List of reference genomes
      references:
        - unite
      # Keep reads mapped to references (true) or remove them as
      # contamination (false)
      keep: true
    # Remove fragments with the same sequence (PCR duplicated)
    deduplicated:
      method: fastuniq  # Supported values: fastuniq
      # If true, generated files are removed after a successful analysis
      temporary: false
    # Randomly select a subset of reads
    subsampled:
      method: seqtk  # Supported values: seqtk
      n_reads: 10  # Number of reads to select
      # Seed for the random number generator (for analysis reproducibility)
      seed: 1
      # If true, generated files are removed after a successful analysis
      temporary: false
    # Join paired reads into single-end reads based on sequence overlap
    joined:
      method: pear  # Supported values: pear
      # If true, generated files are removed after a successful analysis
      temporary: false
  # Summary reports of read characteristics to assess their quality
  report:
    # HTML summary report of read quality
    quality_report:
      method: fastqc  # Supported values: fastqc
      # List of preprocess steps for quality reports
      read_types:
        - original
        - trimmed
# Find the most similar genomic region to reads in reference (mapping process)
mapping:
  # Method for mapping
  mapper:
    method: bowtie2  # Supported values: bowtie2, bwa
    # Additional parameters for the mapper
    params: '--very-sensitive'
    # Keep only reads with consistently mapped reads from both paired-ends
    only_concordant: false
  # Generate .bai index for mapped reads in .bam files
  index:
    method: samtools  # Supported values: samtools
  # Successive steps to refine mapped reads
  postprocess:
    # Reads retrieved from the mapping process
    original:
      # If true, generated files are removed after a successful analysis
      temporary: true
    # Sorted reads
    sorted:
      method: samtools  # Supported values: samtools
      # If true, generated files are removed after a successful analysis
      temporary: false
    # Mark duplicated reads (PCR duplicated)
    deduplicated:
      method: picard  # Supported values: picard
      # If true, generated files are removed after a successful analysis
      temporary: false
  # Summary reports of the mapping process and results
  report:
    # HTML summary with quality of mappings
    quality_report:
      method: qualimap  # Supported values: qualimap
      # List of post-process steps for quality reports
      map_types:
        - deduplicated
# Identify variation in reads given reference genome
variant:
  # Method for variant identification
  caller:
    method: vardict  # Supported values: vardict
    # Variants that do not pass any of these filters would NOT be present in
    # the VCF file
    hard_filter:
      # Minimal proportion of reads with alternative allele against all
      # observations
      min_nonref_allele_freq: 0.05
      # Minimal number of reads with alternative allele
      min_alternate_count: 2
      # Minimal average mapping quality of reads with alternative allele
      min_map_quality: 15
    # Failing these filters would be indicated in the FILTER field of the
    # VCF file
    soft_filter:
      # Minimal average mapping quality of reads with alternative allele
      min_map_quality: 20
      # Minimal number of reads with alternative allele.
      # NOTE(review): this description is identical to the one for
      # min_alternate_count; the key name suggests total read depth at the
      # site - confirm against the caller's documentation.
      read_depth: 10
      # Minimal proportion of reads with alternative allele against all
      # observations
      min_nonref_allele_freq: 0.20
      # Minimal average base quality of bases that support alternative allele
      min_mean_base_quality: 20
  # Reports for variants
  report:
    # Report for variant calling
    calling:
      method: gatk  # Supported values: gatk
    # Summary report for variants
    summary:
      method: custom  # Supported values: custom
# Identify genomic source of sequenced reads
classification:
  # Find homologue sequences by comparing reads to reference sequences
  read_based:
    method: metaxa2  # Supported values: metaxa2
    confidence: 0.8  # Reliability cutoff for taxonomic classification
  # Find homologue sequences based on assembled contigs
  contig_based:
    method: blast  # Supported values: blast
    # List of reference genomes to search for homology
    reference:
      # Name of reference genome (reference/mhv/mhv.fa)
      mhv:
        # Nucleotide or protein, according to sequence type in input .fa files
        query_type: nucleotide
        # Nucleotide or protein, according to sequence type in blast database
        target_type: nucleotide
        # Number of best hit reference sequences from blast database for each
        # input sequence
        max_target_seqs: 10
  # Customized methods for identification of viruses
  viral:
    # Identification of contigs with similarity to viral genomes
    identification:
      method: virfinder  # Supported values: virfinder
  # Summary reports of classification results
  report:
    # Number of reads mapped to each taxonomic unit
    # (report for read_based classification)
    taxonomic_counts:
      # Visualisation in pie plot form
      pieplot:
        method: krona  # Supported values: krona
      # Summary table with number of reads per taxonomic unit
      count_table:
        method: custom  # Supported values: custom
        # List of taxonomic levels for which tables would be generated
        tax_levels:
          - class
          - genus
      # Visualisation in bar plot form
      barplot:
        method: custom  # Supported values: custom
        # Output format of the resulting images
        formats:
          - png
          - svg
        # List of taxonomic levels for which plots would be generated
        tax_levels:
          - class
          - genus
      # Alpha diversity computation
      alpha_diversity:
        method: custom  # Supported values: custom
        # List of taxonomic levels for which alpha diversities would be
        # generated
        tax_levels:
          - class
          - genus
    # Aggregated HTML table with summarized attributes of contigs and homology
    # (report for contig_based classification)
    summary:
      method: fasta_summary  # Supported values: fasta_summary
      # Maximal number of contigs to report (ordered by their length)
      max_query_seqs: 20000
      # Maximal number of homologues from reference genomes to report
      max_target_seqs: 5
      # Show only hits that have at least this proportion of contig mapped to
      # reference
      min_query_coverage: 0.01
      # Optional attributes of contigs to report
      include:
        - virfinder  # Probability that contig is from virus
        - coverage  # Number of aligned reads per contig
        - blast  # Homologues identified by Blast against specified reference databases
      # Attributes applicable only for the HTML report, would NOT be used in
      # the TSV table
      html:
        seqs_per_page: 100  # Number of table rows (sequences) per page
        # Rows would be sorted according to values in this column
        sort_by: Sequence
        # Values would be sorted in desc(ending) or asc(ending) order
        sort_how: asc
        # Show only these attributes, in this order
        columns:
          # Names of contigs with link to fasta files with its sequence
          - Sequence
          # Number of bases
          - Length
          # Complexity of contig sequence, may be used to filter repetitive
          # sequences
          - Compress ratio
          # Average number of reads covering each base of contig
          - Coverage
          # Probability that contig is from viral genome
          - VirFinder pvalue
          # Reference sequences with homology
          - Homologue link
          # Number of mapped reads to contigs
          - Mapped reads
# Join reads into longer sequences (contigs) based on their overlaps
assembly:
  # Method for joining reads
  assembler:
    method: spades  # Supported values: spades, unicycler
    # Supported values: standard, meta, plasmid, rna, iontorrent
    mode: standard
    # Cannot be combined with the meta mode. Tries to reduce number of
    # mismatches and short indels, at the cost of a longer runtime.
    careful: true
  # Summary reports for the assembly process and results
  report:
    # Quality of assembled contigs
    quality_report:
      method: quast  # Supported values: quast
    # Visualisation of overlaps between assembled contigs
    assembly_graph:
      method: bandage  # Supported values: bandage