# config.yaml

samples:                            # Sample categories to analyse
    - name: '.*-16S'                # Regular expression matching sample names (reads/original/.*-16S_R1.fastq.gz)
      reference: silva-16S          # Reference for reads in this category (reference/silva-16S/silva-16S.fa)
    - name: '.*-ITS'                # Second category: sample names that end with "-ITS"
      reference: unite              # Reference for reads in this category (reference/unite/unite.fa)

report_dir: report/example          # Directory where generated reports and essential output files are stored
threads: 16                         # Number of threads to use in analysis

email:                              # Setup email client (will not send emails if not specified)
    setup:                          # Sending configuration
        sendto:                     # Receiver address(es)
            - mkmarcelgg@gmail.com
        gmail:                      # Gmail account used for sending (emails will appear to come from this address); if not provided, sending is attempted through the linux "sendmail" command
            # NOTE(review): the account credentials below are stored in plain text. Anyone who can
            # read this file can use the account - consider rotating the password and keeping it
            # out of version control (confirm which alternatives the pipeline supports).
            login_name: "snakelines.mailclient@gmail.com"   # gmail address for sending emails
            login_pass: "hesielko"                          # gmail password for this address
    onsuccess:                      # Emails to send if the analysis succeeds
        send: True                  # Send only if True
        list_files: False           # Include list of all generated files
        list_copied: False          # Include list of all copied files
    onerror:                        # Emails to send if the analysis fails
        send: True                  # Send only if True
        list_files: True            # Include list of files that should have been generated

reference:                          # Prepare and analyse reference sequences (indexing, alignment, phylogeny, reports)
    index:                          # Indices built over the reference fasta file
        dict:                       # Create sequence dictionary index from fasta file to quicker retrieve substring sequences
            method: picard          # Supported values: picard
        fai:                        # Index fasta file to quicker retrieve substring sequences
            method: samtools        # Supported values: samtools

    alignment:                      # Multiple alignment of reference sequences
        method: mafft               # Supported values: mafft

    phylogeny:                      # Assess phylogenetic relationship between sequences
        method: iqtree              # Supported values: iqtree
        model: GTR+I+G4             # Model to use for phylo tree generation - see iqtree documentation

    report:                         # Visually assess relationship between reference sequences
        phylogenetic_tree:          # Visual inspection of distances between sequences in tree graph structure
            method: phylo           # Supported values: phylo

        comparison:                 # Interactive HTML visualization of multiple alignment
            method: msaviewer       # Supported values: msaviewer

reads:                              # Prepare reads and quality reports for downstream analysis
    preprocess:                     # Pre-processing of reads: eliminate sequencing artifacts, contamination, ...
        original:                   # Original reads without preprocessing
            temporary: False        # If True, files will be removed after successful analysis
        trimmed:                    # Remove low quality parts of reads
            method: trimmomatic     # Supported values: trimmomatic
            temporary: False        # If True, generated files will be removed after successful analysis
            crop: 500               # Maximal number of bases in read to keep. Longer reads will be truncated.
            quality: 20             # Minimal average quality of read bases to keep (inside sliding window of length 5)
            headcrop: 20            # Number of bases to remove from the start of read
            minlen: 35              # Minimal length of trimmed read. Shorter reads will be removed.
            additional_param: ""    # Additional parameters passed to the trimming tool
        decontaminated:             # Eliminate fragments from known artificial source, e.g. contamination by human
            method: bowtie2         # Supported values: bowtie2
            temporary: False        # If True, generated files will be removed after successful analysis
            references:             # List of reference genomes
                - unite
            keep: True              # Keep reads mapped to references (True) or remove them as contamination (False)
        deduplicated:               # Remove fragments with the same sequence (PCR duplicates)
            method: fastuniq        # Supported values: fastuniq
            temporary: False        # If True, generated files will be removed after successful analysis
        subsampled:                 # Randomly select subset of reads
            method: seqtk           # Supported values: seqtk
            n_reads: 10             # Number of reads to select
            seed: 1                 # Seed for the random number generator (for analysis reproducibility)
            temporary: False        # If True, generated files will be removed after successful analysis
        joined:                     # Join paired reads into single end reads based on sequence overlap
            method: pear            # Supported values: pear
            temporary: False        # If True, generated files will be removed after successful analysis
    report:                         # Summary reports of read characteristics to assess their quality
        quality_report:             # HTML summary report of read quality
            method: fastqc          # Supported values: fastqc
            read_types:             # List of preprocess steps for which quality reports are generated
                - original
                - trimmed

mapping:                            # Locate the most similar region of the reference for each read (mapping process)
    mapper:                         # Mapping tool
        method: bowtie2             # Supported values: bowtie2, bwa
        params: '--very-sensitive'  # Additional parameters passed to the mapper
        only_concordant: False      # Keep only reads whose both paired-ends are mapped consistently
    index:                          # Generate .bai index for mapped reads in .bam files
        method: samtools            # Supported values: samtools
    postprocess:                    # Successive steps that refine mapped reads
        original:                   # Reads as retrieved from the mapping process
            temporary: True         # If True, generated files will be removed after successful analysis
        sorted:                     # Sorted reads
            method: samtools        # Supported values: samtools
            temporary: False        # If True, generated files will be removed after successful analysis
        deduplicated:               # Mark duplicated reads (PCR duplicates)
            method: picard          # Supported values: picard
            temporary: False        # If True, generated files will be removed after successful analysis
    report:                         # Summary reports of the mapping process and its results
        quality_report:             # HTML summary with quality of mappings
            method: qualimap        # Supported values: qualimap
            map_types:              # Post-process steps for which quality reports are generated
                - deduplicated

variant:                                    # Identify variation in reads given reference genome
    caller:                                 # Method for variant identification
        method: vardict                     # Supported values: vardict
        hard_filter:                        # Variants failing these filters will NOT be present in the VCF file
            min_nonref_allele_freq: 0.05    # Minimal proportion of reads with alternative allele against all observations
            min_alternate_count: 2          # Minimal number of reads with alternative allele
            min_map_quality: 15             # Minimal average mapping quality of reads with alternative allele
        soft_filter:                        # Failing these filters will be indicated in the FILTER field of the VCF file
            min_map_quality: 20             # Minimal average mapping quality of reads with alternative allele
            read_depth: 10                  # Minimal number of reads with alternative allele - NOTE(review): same wording as min_alternate_count; confirm whether this is total read depth
            min_nonref_allele_freq: 0.20    # Minimal proportion of reads with alternative allele against all observations
            min_mean_base_quality: 20       # Minimal average base quality of bases that support alternative allele
    report:                                 # Reports for variants
        calling:                            # Report for variant calling
            method: gatk                    # Supported values: gatk
        summary:                            # Summary report for variants
            method: custom                  # Supported values: custom

classification:                     # Identify genomic source of sequenced reads
    read_based:                     # Find homologue sequences by comparing reads to reference sequences
        method: metaxa2             # Supported values: metaxa2
        confidence: 0.8             # Reliability cutoff for taxonomic classification
    contig_based:                             # Find homologue sequences based on assembled contigs
        method: blast                         # Supported values: blast
        reference:                            # Reference genomes to search for homology, keyed by name
            mhv:                              # Name of reference genome (reference/mhv/mhv.fa)
                query_type: nucleotide        # Nucleotide or protein, according to sequence type in input .fa files
                target_type: nucleotide       # Nucleotide or protein, according to sequence type in blast database
                max_target_seqs: 10           # Number of best hit reference sequences from blast database for each input sequence
    viral:                                    # Customized methods for identification of viruses
        identification:                       # Identification of contigs with similarity to viral genomes
            method: virfinder                 # Supported values: virfinder
    report:                         # Summary reports of classification results
        taxonomic_counts:           # Number of reads mapped to each taxonomic unit (report for read_based classification)
            pieplot:                # Visualisation in pie plot form
                method: krona       # Supported values: krona
            count_table:            # Summary table with number of reads per taxonomic unit
                method: custom      # Supported values: custom
                tax_levels:         # List of taxonomic levels for which tables will be generated
                    - class
                    - genus
            barplot:                # Visualisation in bar plot form
                method: custom      # Supported values: custom
                formats:            # Output formats of the resulting images
                    - png
                    - svg
                tax_levels:         # List of taxonomic levels for which plots will be generated
                    - class
                    - genus
            alpha_diversity:        # Alpha diversity computation
                method: custom      # Supported values: custom
                tax_levels:         # List of taxonomic levels for which alpha diversities will be generated
                    - class
                    - genus
        summary:                              # Aggregated HTML table with summarized attributes of contigs and homology (report for contig_based classification)
            method: fasta_summary             # Supported values: fasta_summary
            max_query_seqs: 20000             # Maximal number of contigs to report (ordered by their length)
            max_target_seqs: 5                # Maximal number of homologues from reference genomes to report
            min_query_coverage: 0.01          # Show only hits that have at least this proportion of contig mapped to reference
            include:                          # Optional attributes of contigs to report
                - virfinder                   # Probability that contig is from virus
                - coverage                    # Number of aligned reads per contig
                - blast                       # Homologues identified by Blast against specified reference databases
            html:                             # Attributes applicable only to the HTML report; NOT used in the TSV table
                seqs_per_page: 100            # Number of table rows (sequences) per page
                sort_by: 'Sequence'           # Rows will be sorted according to values in this column
                sort_how: 'asc'               # Sort order: desc(ending) or asc(ending)
                columns:                      # Show only these attributes, in this order
                    - Sequence                # Names of contigs with link to fasta files with their sequences
                    - Length                  # Number of bases
                    - Compress ratio          # Complexity of contig sequence, may be used to filter repetitive sequences
                    - Coverage                # Average number of reads covering each base of contig
                    - VirFinder pvalue        # Probability that contig is from viral genome
                    - Homologue link          # Reference sequences with homology
                    - Mapped reads            # Number of reads mapped to contigs

assembly:                           # Join reads into longer sequences (contigs) based on their overlaps
    assembler:                      # Method for joining reads
        method: spades              # Supported values: spades, unicycler
        mode: standard              # Supported values: standard, meta, plasmid, rna, iontorrent
        careful: True               # Cannot be combined with the meta mode. Tries to reduce the number of mismatches and short indels at the cost of longer runtime
    report:                         # Summary reports for assembly process and results
        quality_report:             # Quality of assembled contigs
            method: quast           # Supported values: quast
        assembly_graph:             # Visualisation of overlaps between assembled contigs
            method: bandage         # Supported values: bandage