forked from epi2me-labs/wf-transcriptomes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnextflow_schema.json
384 lines (384 loc) · 18.8 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
"title": "epi2me-labs/wf-transcriptomes",
"description": "Isoform detection and characterisation.",
"url": "https://github.com/epi2me-labs/wf-transcriptomes",
"type": "object",
"definitions": {
"basic_input_output_options": {
"title": "Basic Input/Output Options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"properties": {
"out_dir": {
"type": "string",
"format": "directory-path",
"default": "output",
"description": "Directory for output of all user-facing files."
},
"fastq": {
"type": "string",
"format": "file-path",
"demo_data": "${projectDir}/test_data/fastq",
"description": "A fastq file or directory containing fastq input files or directories of input files.",
"help_text": "If directories named \\\"barcode*\\\" are found under the `--fastq` directory the data is assumed to be multiplex and each barcode directory will be processed independently. If `.fastq(.gz)` files are found under the `--fastq` directory the sample is assumed to not be multiplexed. In this second case `--samples` should be a simple name rather than a CSV file."
},
"sample": {
"type": "string",
"description": "A sample name for non-multiplexed data. Permissible if passing a file or directory of .fastq(.gz)."
},
"sample_sheet": {
"type": "string",
"format": "file-path",
"description": "CSV file with columns named `barcode`, `sample_name` and `type`. Permissible if passing a directory containing barcodeXX sub-directories."
},
"sanitize_fastq": {
"type": "boolean",
"description": "Use additional heuristics to identify barcodes from file paths.",
"help_text": "Enabling this option will group together files into samples by the presence of strings of the form `barcodeXXX` present in filenames, rather than simply files grouped into directories (as output by MinKNOW and the Guppy basecaller)."
},
"ref_genome": {
"type": "string",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_150601a.fasta",
"description": "Path to reference genome sequence [.fa/.fq/.fa.gz/fq.gz]. Required for reference-based workflow"
},
"ref_annotation": {
"type": "string",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf",
"description": "A reference annotation of gff format"
}
},
"required": [
"fastq"
]
},
"global_options": {
"title": "Global options",
"type": "object",
"description": "Options for both sub-workflows",
"properties": {
"threads": {
"type": "integer",
"default": 4
},
"pychopper_opts": {
"type": "string",
"description": "Extra pychopper opts",
"default": "-m edlib"
},
"direct_rna": {
"type": "boolean",
"description": "Set to true for direct RNA sequencing. Omits the pychopper step.",
"default": false
},
"bundle_min_reads": {
"type": "integer",
"description": "Minimum size of bam bundle for parallel processing."
},
"stringtie_opts": {
"type": "string",
"description": "Extra options for stringtie transcript assembly.",
"default": " --conservative "
},
"isoform_table_nrows": {
"type": "integer",
"description": "Maximum rows to dispay in the isoform report table",
"default": 5000
},
"denovo": {
"type": "boolean",
"description": "Use denovo transcript assembly rather than reference guided",
"default": false
}
}
},
"reference_wf_options": {
"title": "Options for reference-based workflow",
"type": "object",
"description": "Parameters that are used solely for the reference-guided workflow",
"properties": {
"plot_gffcmp_stats": {
"type": "boolean",
"description": "Create a pdf of plots from showing gffcompare results"
},
"gffcompare_opts": {
"type": "string",
"description": "Extra options for gffcompare -r",
"default": " -R "
},
"minimap_index_opts": {
"type": "string",
"description": "minimap2 extra indexing options.",
"default": "-k14"
},
"minimap2_opts": {
"type": "string",
"description": "minimap2 extra mapping options.",
"default": "-uf"
},
"minimum_mapping_quality": {
"type": "integer",
"description": "filter aligned reads by MAPQ quality.",
"default": 40
},
"poly_context": {
"type": "integer",
"description": "Region size at end of reads to apply poly(A) filter.",
"default": 24
},
"max_poly_run": {
"type": "integer",
"description": "Max poly(A) region allowed with poly_context-sized end regions.",
"default": 8
}
}
},
"denovo_wf_options": {
"title": "Options for de novo-based workflow",
"type": "object",
"description": "Parameters that are used solely for the de novo workflow",
"properties": {
"batch_size": {
"type": "integer",
"description": "Maximum sequences per input batch (-1 means no limit)",
"default": -1
},
"batch_max_seq": {
"type": "integer",
"description": "Maximum sequences per input batch (-1 means no limit)",
"default": -1
},
"cls_mode": {
"type": "string",
"description": "Clustering mode",
"default": "sahlin"
},
"kmer_size": {
"type": "integer",
"description": "Kmer size",
"default": 11
},
"window_size": {
"type": "integer",
"description": "Window size",
"default": 15
},
"min_left_cls": {
"type": "integer",
"description": "Minimum cluser size in the left batch",
"default": 2
},
"consensus_period": {
"type": "integer",
"description": "Consensus period (-1 means no consensus)",
"default": 500
},
"consensus_minimum": {
"type": "integer",
"description": "Minimum consensus sample size:",
"default": 50
},
"consensus_maximum": {
"type": "integer",
"description": "Maximum consensus sample size",
"default": -150
},
"min_shared": {
"type": "integer",
"description": "Minimum number of minimizers shared between read and cluster",
"default": 5
},
"min_qual": {
"description": "Minimum average quality value",
"type": "number",
"default": 7.0
},
"mapped_threshold": {
"description": "Minimum mapped fraction of read to be included in cluster",
"type": "number",
"default": 0.65
},
"aligned_threshold": {
"type": "number",
"description": "Minimum aligned fraction of read to be included in cluster",
"default": 0.2
},
"min_fraction": {
"type": "number",
"description": "Minimum fraction of minimizers shared compared to best hit, in order to continue mapping",
"default": 0.8
},
"min_prob_no_hits": {
"type": "number",
"description": "Minimum probability for i consecutive minimizers to be different between read and representative",
"default": 0.2
}
}
},
"fusion_detection_options": {
"title": "Gene fusion detection options",
"type": "object",
"description": "Parameters for gene fusion detection",
"properties": {
"jaffal_refBase": {
"type": "string",
"format": "path",
"description": "JAFFAl reference genome directory"
},
"jaffal_genome": {
"type": "string",
"description": "Genome reference prefix. e.g. hg38",
"default": "hg38"
},
"jaffal_annotation": {
"type": "string",
"description": "Annotation prefix",
"default": "genCode22"
},
"jaffal_dir": {
"type": "string",
"format": "path",
"description": "Path to JAFFAL git code directory. Defaults is epi2me-labs container location",
"default": "/home/epi2melabs/JAFFA"
}
}
},
"differential_expression_options": {
"title": "Differential expression options",
"type": "object",
"description": "",
"default": "",
"properties": {
"de_analysis": {
"type": "boolean",
"description": "Run DE anaylsis",
"help_text": "Running this requires you to provide at least two replicates for a control and treated sample as well as a condition sheet param."
},
"condition_sheet": {
"type": "string",
"format": "file-path",
"description": "csv with (sample, condition, type)",
"default": "null"
},
"ref_transcriptome": {
"type": "string",
"default": "null",
"format": "file-path",
"description": "Transcriptome reference file"
},
"min_gene_expr": {
"type": "integer",
"default": 10,
"description": "Minimum gene counts"
},
"min_feature_expr": {
"type": "integer",
"default": 3,
"description": "Minimum transcript counts"
},
"min_samps_feature_expr": {
"type": "integer",
"default": 1,
"description": "Transcripts expressed in minimum this many samples"
},
"min_samps_gene_expr": {
"type": "integer",
"description": "Genes expressed in minimum this many samples",
"default": 3
}
}
},
"meta_data": {
"title": "Meta Data",
"type": "object",
"description": "",
"default": "",
"properties": {
"report_name": {
"type": "string",
"default": "report",
"description": "Output report filename suffix."
},
"disable_ping": {
"type": "boolean",
"default": false,
"description": "Enable to prevent sending a workflow ping."
}
}
},
"generic_options": {
"title": "Generic options",
"type": "object",
"fa_icon": "far fa-question-circle",
"description": "Less common options for the pipeline, typically set in a config file.",
"help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
"properties": {
"help": {
"type": "boolean",
"description": "Display help text.",
"fa_icon": "fas fa-question-circle",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/basic_input_output_options"
},
{
"$ref": "#/definitions/global_options"
},
{
"$ref": "#/definitions/reference_wf_options"
},
{
"$ref": "#/definitions/denovo_wf_options"
},
{
"$ref": "#/definitions/fusion_detection_options"
},
{
"$ref": "#/definitions/differential_expression_options"
},
{
"$ref": "#/definitions/meta_data"
},
{
"$ref": "#/definitions/generic_options"
}
],
"properties": {
"aws_image_prefix": {
"type": "string",
"hidden": true
},
"aws_queue": {
"type": "string",
"hidden": true
},
"wfversion": {
"type": "string",
"default": "v0.1.5",
"hidden": true
},
"monochrome_logs": {
"type": "boolean"
},
"validate_params": {
"type": "boolean",
"default": true
},
"show_hidden_params": {
"type": "boolean"
}
},
"docs": {
"intro": "## Introduction\n\nThis workflow identifies RNA isoforms using either cDNA or direct RNA (dRNA) \nOxford Nanopore reads.\n\n### Preprocesing\ncDNA reads are initially preprocessed by [pychopper](https://github.com/epi2me-labs/pychopper) \nfor the identification of full-length reads, as well as trimming and orientation correction (This step is omitted for \n direct RNA reads).\n\n\n### Transcript assembly\n\n#### Reference-aided transcript assembly approach\n* Full length reads are mapped to a supplied reference genome using [minimap2](https://github.com/lh3/minimap2)\n* Transcripts are assembled by [stringtie](http://ccb.jhu.edu/software/stringtie) \nin long read mode (with or without a guide reference annotation) to generate the GFF annotation.\n* The annotation generated by the pipeline is compared to the reference annotation. \nusing [gffcompare](http://ccb.jhu.edu/software/stringtie/gffcompare.shtml)\n\n#### de novo-based transcript assembly (experimental!)\n* Sequence clusters are generated using [isONclust2](https://github.com/nanoporetech/isONclust2)\n * If a reference genome is supplied, cluster quality metrics are determined by comparing \n with clusters generated from a minimap2 alignment.\n* A consensus sequence for each cluster is generated using [spoa](https://github.com/rvaser/spoa)\n* Three rounds of polishing using racon and minimap2 to give a final polished CDS for each gene.\n* Full-length reads are then mapped to these polished CDS.\n* Transcripts are assembled by stringtie as for the reference-based approach.\n* __Note__: This approach is currently not supported with direct RNA reads.\n\n### Fusion gene detection\nFusion gene detection is performed using [JAFFA](https://github.com/Oshlack/JAFFA), with the JAFFAL extension for use \nwith ONT long reads. \n\n### Differential expression analysis\n* Differential expression is done using the transcripts output by the workflow.\n* A non redundant transcriptome is found using the merge function in [stringtie](http://ccb.jhu.edu/software/stringtie).\n* The reads are then aligned to the transcriptome using minimap2 in a splice-aware manner.\n* [salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification.\n* R packages [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and [stageR](https://bioconductor.org/packages/release/bioc/html/stageR.html) are used for differential expression analysis.\n* [DEXSeq](https://bioconductor.org/packages/release/bioc/html/DEXSeq.html) is then used for differential transcript usage analysis.\n\n### Workflow inputs\n- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples\n (in fastq/fastq.gz format)\n- Reference genome in fasta format (required for reference-based assembly).\n- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).\n- For fusion detection, JAFFAL reference files (see Quickstart) \n",
"links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [Singularity](https://sylabs.io/singularity/)\n* [conda](https://docs.conda.io/en/latest/miniconda.html)\n* [racon](https://github.com/isovic/racon)\n* [spoa](https://github.com/rvaser/spoa)\n* [inONclust](https://github.com/ksahlin/isONclust)\n* [isONclust2](https://github.com/nanoporetech/isONclust2)"
}
}