nextflow_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
    "title": "epi2me-labs/wf-transcriptomes",
    "description": "Isoform detection and characterisation.",
    "url": "https://github.com/epi2me-labs/wf-transcriptomes",
    "type": "object",
    "definitions": {
        "basic_input_output_options": {
            "title": "Basic Input/Output Options",
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
            "properties": {
                "out_dir": {
                    "type": "string",
                    "format": "directory-path",
                    "default": "output",
                    "description": "Directory for output of all user-facing files."
                },
                "fastq": {
                    "type": "string",
                    "format": "file-path",
                    "demo_data": "${projectDir}/test_data/fastq",
                    "description": "A fastq file or directory containing fastq input files or directories of input files.",
                    "help_text": "If directories named \\\"barcode*\\\" are found under the `--fastq` directory the data is assumed to be multiplex and each barcode directory will be processed independently. If `.fastq(.gz)` files are found under the `--fastq` directory the sample is assumed to not be multiplexed. In this second case `--samples` should be a simple name rather than a CSV file."
                },
                "sample": {
                    "type": "string",
                    "description": "A sample name for non-multiplexed data. Permissible if passing a file or directory of .fastq(.gz)."
                },
                "sample_sheet": {
                    "type": "string",
                    "format": "file-path",
                    "description": "CSV file with columns named `barcode`, `sample_name` and `type`. Permissible if passing a directory containing barcodeXX sub-directories."
                },
                "sanitize_fastq": {
                    "type": "boolean",
                    "description": "Use additional heuristics to identify barcodes from file paths.",
                    "help_text": "Enabling this option will group together files into samples by the presence of strings of the form `barcodeXXX` present in filenames, rather than simply files grouped into directories (as output by MinKNOW and the Guppy basecaller)."
                },
                "ref_genome": {
                    "type": "string",
                    "format": "file-path",
                    "demo_data": "${projectDir}/test_data/SIRV_150601a.fasta",
                    "description": "Path to reference genome sequence [.fa/.fq/.fa.gz/fq.gz]. Required for reference-based workflow"
                },
                "ref_annotation": {
                    "type": "string",
                    "format": "file-path",
                    "demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf",
                    "description": "A reference annotation of gff format"
                }
            },
            "required": [
                "fastq"
            ]
        },
        "global_options": {
            "title": "Global options",
            "type": "object",
            "description": "Options for both sub-workflows",
            "properties": {
                "threads": {
                    "type": "integer",
                    "default": 4
                },
                "pychopper_opts": {
                    "type": "string",
                    "description": "Extra pychopper opts",
                    "default": "-m edlib"
                },
                "direct_rna": {
                    "type": "boolean",
                    "description": "Set to true for direct RNA sequencing. Omits the pychopper step.",
                    "default": false
                },
                "bundle_min_reads": {
                    "type": "integer",
                    "description": "Minimum size of bam bundle for parallel processing."
                },
                "stringtie_opts": {
                    "type": "string",
                    "description": "Extra options for stringtie transcript assembly.",
                    "default": " --conservative "
                },
                "isoform_table_nrows": {
                    "type": "integer",
                    "description": "Maximum rows to dispay in the isoform report table",
                    "default": 5000
                },
                "denovo": {
                    "type": "boolean",
                    "description": "Use denovo transcript assembly rather than reference guided",
                    "default": false
                }
            }
        },
        "reference_wf_options": {
            "title": "Options for reference-based workflow",
            "type": "object",
            "description": "Parameters that are used solely for the reference-guided workflow",
            "properties": {
                "plot_gffcmp_stats": {
                    "type": "boolean",
                    "description": "Create a pdf of plots from showing gffcompare results"
                },
                "gffcompare_opts": {
                    "type": "string",
                    "description": "Extra options for gffcompare -r",
                    "default": " -R "
                },
                "minimap_index_opts": {
                    "type": "string",
                    "description": "minimap2 extra indexing options.",
                    "default": "-k14"
                },
                "minimap2_opts": {
                    "type": "string",
                    "description": "minimap2 extra mapping options.",
                    "default": "-uf"
                },
                "minimum_mapping_quality": {
                    "type": "integer",
                    "description": "filter aligned reads by MAPQ quality.",
                    "default": 40
                },
                "poly_context": {
                    "type": "integer",
                    "description": "Region size at end of reads to apply poly(A) filter.",
                    "default": 24
                },
                "max_poly_run": {
                    "type": "integer",
                    "description": "Max poly(A) region allowed with poly_context-sized end regions.",
                    "default": 8
                }
            }
        },
        "denovo_wf_options": {
            "title": "Options for de novo-based workflow",
            "type": "object",
            "description": "Parameters that are used solely for the de novo workflow",
            "properties": {
                "batch_size": {
                    "type": "integer",
                    "description": "Maximum sequences per input batch (-1 means no limit)",
                    "default": -1
                },
                "batch_max_seq": {
                    "type": "integer",
                    "description": "Maximum sequences per input batch (-1 means no limit)",
                    "default": -1
                },
                "cls_mode": {
                    "type": "string",
                    "description": "Clustering mode",
                    "default": "sahlin"
                },
                "kmer_size": {
                    "type": "integer",
                    "description": "Kmer size",
                    "default": 11
                },
                "window_size": {
                    "type": "integer",
                    "description": "Window size",
                    "default": 15
                },
                "min_left_cls": {
                    "type": "integer",
                    "description": "Minimum cluser size in the left batch",
                    "default": 2
                },
                "consensus_period": {
                    "type": "integer",
                    "description": "Consensus period (-1 means no consensus)",
                    "default": 500
                },
                "consensus_minimum": {
                    "type": "integer",
                    "description": "Minimum consensus sample size:",
                    "default": 50
                },
                "consensus_maximum": {
                    "type": "integer",
                    "description": "Maximum consensus sample size",
                    "default": -150
                },
                "min_shared": {
                    "type": "integer",
                    "description": "Minimum number of minimizers shared between read and cluster",
                    "default": 5
                },
                "min_qual": {
                    "description": "Minimum average quality value",
                    "type": "number",
                    "default": 7.0
                },
                "mapped_threshold": {
                    "description": "Minimum mapped fraction of read to be included in cluster",
                    "type": "number",
                    "default": 0.65
                },
                "aligned_threshold": {
                    "type": "number",
                    "description": "Minimum aligned fraction of read to be included in cluster",
                    "default": 0.2
                },
                "min_fraction": {
                    "type": "number",
                    "description": "Minimum fraction of minimizers shared compared to best hit, in order to continue mapping",
                    "default": 0.8
                },
                "min_prob_no_hits": {
                    "type": "number",
                    "description": "Minimum probability for i consecutive minimizers to be different between read and representative",
                    "default": 0.2
                }
            }
        },
        "fusion_detection_options": {
            "title": "Gene fusion detection options",
            "type": "object",
            "description": "Parameters for gene fusion detection",
            "properties": {
                "jaffal_refBase": {
                    "type": "string",
                    "format": "path",
                    "description": "JAFFAl reference genome directory"
                },
                "jaffal_genome": {
                    "type": "string",
                    "description": "Genome reference prefix. e.g. hg38",
                    "default": "hg38"
                },
                "jaffal_annotation": {
                    "type": "string",
                    "description": "Annotation prefix",
                    "default": "genCode22"
                },
                "jaffal_dir": {
                    "type": "string",
                    "format": "path",
                    "description": "Path to JAFFAL git code directory. Defaults is epi2me-labs container location",
                    "default": "/home/epi2melabs/JAFFA"
                }
            }
        },
        "differential_expression_options": {
            "title": "Differential expression options",
            "type": "object",
            "description": "",
            "default": "",
            "properties": {
                "de_analysis": {
                    "type": "boolean",
                    "description": "Run DE anaylsis",
                    "help_text": "Running this requires you to provide at least two replicates for a control and treated sample as well as a condition sheet param."
                },
                "condition_sheet": {
                    "type": "string",
                    "format": "file-path",
                    "description": "csv with (sample, condition, type)",
                    "default": "null"
                },
                "ref_transcriptome": {
                    "type": "string",
                    "default": "null",
                    "format": "file-path",
                    "description": "Transcriptome reference file"
                },
                "min_gene_expr": {
                    "type": "integer",
                    "default": 10,
                    "description": "Minimum gene counts"
                },
                "min_feature_expr": {
                    "type": "integer",
                    "default": 3,
                    "description": "Minimum transcript counts"
                },
                "min_samps_feature_expr": {
                    "type": "integer",
                    "default": 1,
                    "description": "Transcripts expressed in minimum this many samples"
                },
                "min_samps_gene_expr": {
                    "type": "integer",
                    "description": "Genes expressed in minimum this many samples",
                    "default": 3
                }
            }
        },
        "meta_data": {
            "title": "Meta Data",
            "type": "object",
            "description": "",
            "default": "",
            "properties": {
                "report_name": {
                    "type": "string",
                    "default": "report",
                    "description": "Output report filename suffix."
                },
                "disable_ping": {
                    "type": "boolean",
                    "default": false,
                    "description": "Enable to prevent sending a workflow ping."
                }
            }
        },
        "generic_options": {
            "title": "Generic options",
            "type": "object",
            "fa_icon": "far fa-question-circle",
            "description": "Less common options for the pipeline, typically set in a config file.",
            "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
            "properties": {
                "help": {
                    "type": "boolean",
                    "description": "Display help text.",
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/basic_input_output_options"
        },
        {
            "$ref": "#/definitions/global_options"
        },
        {
            "$ref": "#/definitions/reference_wf_options"
        },
        {
            "$ref": "#/definitions/denovo_wf_options"
        },
        {
            "$ref": "#/definitions/fusion_detection_options"
        },
        {
            "$ref": "#/definitions/differential_expression_options"
        },
        {
            "$ref": "#/definitions/meta_data"
        },
        {
            "$ref": "#/definitions/generic_options"
        }
    ],
    "properties": {
        "aws_image_prefix": {
            "type": "string",
            "hidden": true
        },
        "aws_queue": {
            "type": "string",
            "hidden": true
        },
        "wfversion": {
            "type": "string",
            "default": "v0.1.5",
            "hidden": true
        },
        "monochrome_logs": {
            "type": "boolean"
        },
        "validate_params": {
            "type": "boolean",
            "default": true
        },
        "show_hidden_params": {
            "type": "boolean"
        }
    },
    "docs": {
        "intro": "## Introduction\n\nThis workflow identifies RNA isoforms using either cDNA or direct RNA (dRNA) \nOxford Nanopore reads.\n\n### Preprocesing\ncDNA reads are initially preprocessed by [pychopper](https://github.com/epi2me-labs/pychopper) \nfor the identification of full-length reads, as well as trimming and orientation correction (This step is omitted for \n direct RNA reads).\n\n\n### Transcript assembly\n\n#### Reference-aided transcript assembly approach\n* Full length reads are mapped to a supplied reference genome using [minimap2](https://github.com/lh3/minimap2)\n* Transcripts are assembled by [stringtie](http://ccb.jhu.edu/software/stringtie) \nin long read mode (with or without a guide reference annotation) to generate the GFF annotation.\n* The annotation generated by the pipeline is compared to the reference annotation. \nusing [gffcompare](http://ccb.jhu.edu/software/stringtie/gffcompare.shtml)\n\n#### de novo-based transcript assembly (experimental!)\n* Sequence clusters are generated using [isONclust2](https://github.com/nanoporetech/isONclust2)\n  * If a reference genome is supplied, cluster quality metrics are determined by comparing    \n  with clusters generated from a minimap2 alignment.\n* A consensus sequence for each cluster is generated using [spoa](https://github.com/rvaser/spoa)\n* Three rounds of polishing using racon and minimap2 to give a final polished CDS for each gene.\n* Full-length reads are then mapped to these polished CDS.\n* Transcripts are assembled by stringtie as for the reference-based approach.\n* __Note__: This approach is currently not supported with direct RNA reads.\n\n### Fusion gene detection\nFusion gene detection is performed using [JAFFA](https://github.com/Oshlack/JAFFA), with the JAFFAL extension for use \nwith ONT long reads. \n\n### Differential expression analysis\n* Differential expression is done using the transcripts output by the workflow.\n* A non redundant transcriptome is found using the merge function in [stringtie](http://ccb.jhu.edu/software/stringtie).\n* The reads are then aligned to the transcriptome using minimap2 in a splice-aware manner.\n* [salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification.\n* R packages [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and [stageR](https://bioconductor.org/packages/release/bioc/html/stageR.html) are used for differential expression analysis.\n* [DEXSeq](https://bioconductor.org/packages/release/bioc/html/DEXSeq.html) is then used for differential transcript usage analysis.\n\n### Workflow inputs\n- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples\n  (in fastq/fastq.gz format)\n- Reference genome in fasta format (required for reference-based assembly).\n- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).\n- For fusion detection, JAFFAL reference files (see Quickstart) \n",
        "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [Singularity](https://sylabs.io/singularity/)\n* [conda](https://docs.conda.io/en/latest/miniconda.html)\n* [racon](https://github.com/isovic/racon)\n* [spoa](https://github.com/rvaser/spoa)\n* [inONclust](https://github.com/ksahlin/isONclust)\n* [isONclust2](https://github.com/nanoporetech/isONclust2)"
    }
}