-
Notifications
You must be signed in to change notification settings - Fork 15
/
config.yaml
87 lines (77 loc) · 3.95 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File listing the sample names and their read files
## Tab-delimited table with up to three columns (the third is optional),
## see manual/usage.md
##   sample_name  read1_file  [read2_file]
## Paired-end and single-end samples cannot be mixed:
## if paired end, all samples must be paired end;
## if single end, all samples must be single end.
## If this file is specified, sample_reports_file must not be specified.
sample_reads_file: ''
# File containing sample groups
## Tab-delimited table with two columns, see manual/downstream_plotting.md
## If left as an empty string, all samples will be treated as one group.
sample_groups_file: ''
# OPTIONAL: file containing sample reports
## Use this with Snakefile_downstream_only to do the downstream processing
## and figure generation steps from pipeline results that already exist.
## Tab-delimited table with up to three columns (the third is optional):
##   sample_name  kraken_report  [bracken_report]
## If only kraken_report is listed, run_bracken below must be false.
## If this file is specified, sample_reads_file must not be specified.
sample_reports_file: ''
# Output base directory; will contain subfolders for reports, plots, etc.
outdir: ''
# Do Bracken calculations and use the results in downstream processing?
## Boolean: true or false
run_bracken: true
# Read length of the input data
## Used in the Bracken species re-estimation.
## You must have a Bracken database built with this read length.
read_length: 150
# Confidence threshold: a recently introduced feature for Kraken2
## Increase this to reduce false positives from Kraken2.
## Must be between 0.0 and 1.0 (inclusive).
## See: https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#confidence-scoring
confidence_threshold: 0.0
# Taxonomic level for Bracken results
## Currently, only "S" (species) is supported.
taxonomic_level: S
# Database
## Must contain both a Kraken2 and a Bracken database.
## See the database alternatives listed below if you are working on SCG.
## manual/databases.md has more information.
## Default Bhatt lab database: /labs/asbhatt/data/program_indices/kraken2/kraken_custom_dec2021/genbank_genome_chromosome_scaffold
database: ''
######### Downstream processing options ###################
# Downstream processing for filtering OTUs
## This removes any classification result where all samples are below this threshold,
## which serves to remove the very long tail of low-abundance species in Kraken2/Bracken results.
## This parameter can be tuned, but keeping a nonzero value here is recommended to reduce the long tail.
## This parameter is defined as a PERCENTAGE, so 0.001 = 0.001%
min_otu_percentage: 0.001
# Filters for compositional data analysis, which get applied at each taxonomic level
## Keep only those samples with > 5000 reads
codata_min_reads: 5000
## Keep only OTUs with a PROPORTION of at least 0.001
## This is equivalent to a PERCENTAGE of 0.1
## (note the unit: min_otu_percentage is given as a percentage, this is not)
codata_min_prop: 0.001
## Keep OTUs that are found in at least 30% of samples
codata_otu_cutoff: 0.3
# Remove everything in the phylum Chordata?
## This helps to remove pesky human reads from the matrices and plots.
## Boolean: true or false
remove_chordata: true
###########################################################
########### Extracting unmapped reads #####################
# Extract unmapped reads from this database for use with another database?
## The extracted reads will be in the output folder outdir/unmapped_reads.
## Boolean: true or false
extract_unmapped: false
###########################################################
########### Database alternatives on SCG (see README): ####
# GenBank 2021, viral sequences only
# /labs/asbhatt/data/program_indices/kraken2/kraken_custom_dec2021/genbank_viral
# RefSeq 2021
# /labs/asbhatt/data/program_indices/kraken2/kraken_custom_dec2021/refseq
# RefSeq 2021, viral sequences only
# /labs/asbhatt/data/program_indices/kraken2/kraken_custom_dec2021/refseq_viral
# GenBank 2020 database used in many past analyses:
# /labs/asbhatt/data/program_indices/kraken2/kraken_custom_jan2020/genbank_genome_chromosome_scaffold
# (and other databases within this parent folder)
# UHGG v1.0 database:
# /oak/stanford/scg/lab_asbhatt/data/program_indices/kraken2/uhgg
###########################################################