-
Notifications
You must be signed in to change notification settings - Fork 35
/
config.yml
281 lines (208 loc) · 7.92 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
--- # HGTector 2.0 configuration file
# This file defines the default values of some command-line arguments.
# The program will sequentially look for "config.yml" in the following
# locations:
# 1. current directory
# 2. home directory, under subdirectory ".hgtector"
# 3. program directory
## Database locations

# Each entry is a path; an empty value is parsed as null.
database:

  # reference protein sequence database for DIAMOND
  diamond:

  # reference protein sequence database for BLASTp
  blast:

  # directory of taxonomy database files (NCBI-style nodes.dmp and names.dmp)
  # if omitted, the program will retrieve information from remote server
  taxdump:

  # sequence Id to taxId mapping file (e.g., NCBI's prot.accession2taxid)
  # not necessary if protein database already contains taxonomy information
  taxmap:
## External program executables

# An empty value is parsed as null.
program:

  # DIAMOND executable
  diamond:

  # BLAST executables
  blastp:
  blastdbcmd:
  makeblastdb:
## Remote server URLs

# endpoints hosted at blast.ncbi.nlm.nih.gov and eutils.ncbi.nlm.nih.gov
server:
  search: https://blast.ncbi.nlm.nih.gov/Blast.cgi
  selfaln: https://blast.ncbi.nlm.nih.gov/BlastAlign.cgi
  fetch: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
## Sequence homology search

# each protein sequence will be searched against a reference database

search:

  # search method
  # options: auto, diamond, blast, remote, precomp (pre-computed result)
  # if auto, priority will be precomp > diamond > blast > remote, if
  # available
  method: auto

  # gene (protein) filtering
  minsize: 30      # minimum size (aa) of a valid protein

  # search cutoffs
  # note on evalue: keep the decimal point; YAML 1.1 parsers such as PyYAML
  # read a value like "1e-5" as a string rather than a float
  maxseqs: 500     # maximum number of sequences to return
  evalue: 1.0e-5   # maximum E-value cutoff (note: keep decimal point)
  identity: 0      # minimum percent identity cutoff
  coverage: 0      # minimum percent query coverage cutoff

  # hits filtering
  maxhits: 0       # maximum number of hits to preserve (0 for unlimited)

  # self-alignment method
  # options: auto, native, fast, lookup, precomp
  #   - native: use choice of search method (diamond, blast or remote)
  #   - fast: use built-in algorithm to calculate blast bit score
  #   - lookup: search result already contains self-alignment result
  #   - auto: priority is precomp > native > fast, if available
  selfaln: auto
## Local search behavior

# NOTE(review): restored as a top-level section, parallel to the
# "## BLAST search behavior" and "## Remote search behavior" sections;
# confirm against the key paths the program reads.
local:
  threads: 0   # number of threads (0 for all CPU cores)
  tmpdir:      # temporary directory
## DIAMOND search behavior

diamond:
  queries: 0    # number of queries per run (0 for whole sample)
  maxchars: 0   # maximum number of characters per run (0 for unlimited)
  extrargs:     # extra command-line arguments for diamond
## BLAST search behavior

blast:
  queries: 100       # number of queries per run
  maxchars: 100000   # maximum number of characters per run
  extrargs:          # extra command-line arguments for blastp
## Remote search behavior

remote:

  # remote search database
  # options: nr, refseq_select_prot, refseq_protein, swissprot, pdb, etc.
  db: refseq_select_prot

  queries: 0       # number of queries per run
  maxchars: 7000   # maximum number of characters per run (note: a valid URL
                   # typically cannot exceed 8,000 characters)
  retries: 5       # maximum number of retries per search
  delay: 60        # seconds between two search requests
  timeout: 1800    # seconds before program gives up waiting

  # extra URL arguments for remote search
  # the following default setting means:
  #   * word size is 6 (for amino acids)
  #   * filter out low-complexity regions
  #   * limit search to cellular organisms (TaxID: 131567)
  #   * exclude uncultured/environmental sample sequences
  #   * don't attempt to retrieve NCBI GI
  extrargs: "&WORD_SIZE=6&FILTER=m%20S&ENTREZ_QUERY=txid131567+%5BORGN%5D&EXCLUDE_SEQ_UNCULT=on&NCBI_GI=false"
## Fetch information from remote server

fetch:

  # whether to enable remote fetch
  # options: auto, yes, no
  # if auto, only enable when search is remote or taxdump is not provided
  enable: auto

  # remote query behavior
  queries: 100   # maximum number of query entries per search
  retries: 3     # maximum number of retries per search
  delay: 5       # seconds between two fetch requests
  timeout: 60    # seconds before program gives up waiting
## Download database files from remote server

download:
  retries: 3    # maximum number of retries per file
  delay: 10     # seconds between retries
  timeout: 60   # seconds before program gives up waiting
## Taxonomic filtering

taxonomy:

  # include taxIds equal to or as children of any of the following taxIds
  # i.e., limit the search range within those taxIds
  include:
  # examples:
  #   - 2      # Bacteria
  #   - 2157   # Archaea
  #   - 2759   # Eukaryota

  # exclude taxIds equal to or as children of any of the following taxIds
  exclude:
  # examples:
  #   - 10239  # Viruses

  # ignore more than one hit with same taxId (i.e., potential paralogs)
  unique: yes

  # ignore more than one hit assigned to the same taxonomic rank
  unirank:  # example: species

  # ignore taxon names that are not capitalized
  capital: yes

  # ignore species names that are not Latinate
  latin: no

  # ignore taxon names that contain any of the following words
  block:
    - unknown
    - uncultured
    - unidentified
    - unclassified
    - unresolved
    - environmental
    - plasmid
    - vector
    - synthetic
    - phage
## Hit filtering for analysis

analyze:

  # one may test multiple sets of filters that are more stringent than the
  # homology search parameters (see above), so that one does not need to repeat
  # the expensive search step
  # (an empty value is parsed as null)
  maxhits:
  evalue:
  identity:
  coverage:
## Taxonomic grouping

grouping:

  # assign an input genome to a taxon if it represents this percentage or more
  # best hits (50-100)
  inputcov: 75

  # "self" group must be at or above this taxonomic rank (e.g., species)
  selfrank:

  # "close" group must have this number or more taxa (larger is more
  # statistically informative, but consider biological question)
  closesize: 10
## HGT prediction statistics

predict:

  ## gene score calculation

  # weigh genes by bit scores (recommended); otherwise simply count occurrence
  weighted: yes

  ## gene filtering

  # detect and remove outliers at the high end using this method
  # options: none, zscore, boxplot
  outliers: zscore

  # keep orphans (a.k.a. ORFans; gene without non-self hits)
  orphans: no

  ## kernel density estimation (KDE)

  # kernel bandwidth value or method for estimation
  # options: auto, grid, or a number between 0.1 and 1.0
  #   auto: a built-in algorithm that tests bandwidth values from high to low
  #     until an "atypical" cluster at the low end can be identified
  #   grid: grid search optimization with cross validation to determine the
  #     best bandwidth
  #   number: larger is smoother; smaller is sharper
  bandwidth: auto

  # number of steps for kernel bandwidth optimization,
  # larger is more accurate, but slower
  bwsteps: 20

  # "atypical" part cannot exceed this quantile in automatic bandwidth
  # estimation
  # lower is more accurate but may fail in more instances
  lowpart: 75

  # exclude this percentage of valley-to-peak distance from "atypical" cluster
  # larger is more stringent
  noise: 50

  # if clustering threshold cannot be determined by statistics, mandatorily
  # assign this quantile from low end as "atypical" part
  fixed: 25

  ## cluster refinement

  # drop data points with silhouette score below this threshold AND at the
  # ambiguous side of centroid (i.e., uncertain classification)
  silhouette: 0.5

  # an additional criterion for HGT prediction: "self" score must be atypically
  # low
  selflow: no
## Potential donor reporting

donor:

  # find a taxId that best describes the potential donor of a gene if it is
  # HGT-derived; it is the LCA of distal hits with bit score at most this
  # percentage lower than the best distal hit
  distaltop: 10

  # report taxon name instead of taxId
  name: no

  # report donor at this rank
  rank:
...