From 0f44949d629785f1ce386627f9fe1577fb7c5fbd Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:08:30 +0530 Subject: [PATCH] Add salmon (#24) * add salmon index and quant * add test resources * add help text * script and config * add test * update script and test * add salmon quant * update CHANGELOG.md * update to viash 0.9 format * remove echo ststement * output the main salmon output file separately * check if output file has the right columns * check if correct output files were generated * fix doi Co-authored-by: Robrecht Cannoodt * use the default multiple separator * rename components * remove print statements * check info.json output * reduce size of fastq files and generate index in test script * use smaller (manual) test data * set A as the default lib_type * Merge branch 'add_salmon' of https://github.com/viash-hub/biobase into add_salmon * add test to check content of output index * add more detailed description about libType * add test data * delete test data * move to salmon_index and salmon_quant --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 6 +- src/salmon/salmon_index/config.vsh.yaml | 113 +++ src/salmon/salmon_index/help.txt | 66 ++ src/salmon/salmon_index/script.sh | 49 ++ src/salmon/salmon_index/test.sh | 35 + src/salmon/salmon_quant/config.vsh.yaml | 589 ++++++++++++++ src/salmon/salmon_quant/help.txt | 976 ++++++++++++++++++++++++ src/salmon/salmon_quant/script.sh | 151 ++++ src/salmon/salmon_quant/test.sh | 156 ++++ 9 files changed, 2140 insertions(+), 1 deletion(-) create mode 100644 src/salmon/salmon_index/config.vsh.yaml create mode 100644 src/salmon/salmon_index/help.txt create mode 100644 src/salmon/salmon_index/script.sh create mode 100644 src/salmon/salmon_index/test.sh create mode 100644 src/salmon/salmon_quant/config.vsh.yaml create mode 100644 src/salmon/salmon_quant/help.txt create mode 100644 src/salmon/salmon_quant/script.sh create mode 100644 
src/salmon/salmon_quant/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 709ca4a0..c3f22e96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,11 @@ * `star/star_align_reads`: Align reads to a reference genome (PR #22). -* `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #29). +* `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #29). + +* `salmon`: + - `salmon/salmon_index`: Create a salmon index for the transcriptome to use Salmon in the mapping-based mode (PR #24). + - `salmon/salmon_quant`: Transcript quantification from RNA-seq data (PR #24). ## MAJOR CHANGES diff --git a/src/salmon/salmon_index/config.vsh.yaml b/src/salmon/salmon_index/config.vsh.yaml new file mode 100644 index 00000000..f24cd3a9 --- /dev/null +++ b/src/salmon/salmon_index/config.vsh.yaml @@ -0,0 +1,113 @@ +name: salmon_index +namespace: salmon +description: | + Salmon is a tool for wicked-fast transcript quantification from RNA-seq data. It can either make use of pre-computed alignments (in the form of a SAM/BAM file) to the transcripts rather than the raw reads, or can be run in the mapping-based mode. This component creates a salmon index for the transcriptome to use Salmon in the mapping-based mode. It is generally recommended that you build a decoy-aware transcriptome file. This is done using the entire genome of the organism as the decoy sequence by concatenating the genome to the end of the transcriptome to be indexed and populating the decoys.txt file with the chromosome names. 
+keywords: ["Transcriptome", "Index"] +links: + homepage: https://salmon.readthedocs.io/en/latest/salmon.html + documentation: https://salmon.readthedocs.io/en/latest/salmon.html + repository: https://github.com/COMBINE-lab/salmon +references: + doi: 10.1038/nmeth.4197 +license: GPL-3.0 +requirements: + commands: [ salmon ] + +argument_groups: + - name: Inputs + arguments: + - name: --genome + type: file + description: | + Genome of the organism to prepare the set of decoy sequences. Required to build a decoy-aware transcriptome. + required: false + example: genome.fasta + - name: --transcripts + alternatives: ["-t"] + type: file + description: | + Transcript fasta file. + required: true + example: transcriptome.fasta + - name: --kmer_len + alternatives: ["-k"] + type: integer + description: | + The size of k-mers that should be used for the quasi index. + required: false + example: 31 + - name: --gencode + type: boolean_true + description: | + This flag will expect the input transcript fasta to be in GENCODE format, and will split the transcript name at the first '|' character. These reduced names will be used in the output and when looking for these transcripts in a gene to transcript GTF. + - name: --features + type: boolean_true + description: | + This flag will expect the input reference to be in the tsv file format, and will split the feature name at the first 'tab' character. These reduced names will be used in the output and when looking for the sequence of the features.GTF. + - name: --keep_duplicates + type: boolean_true + description: | + This flag will disable the default indexing behavior of discarding sequence-identical duplicate transcripts. If this flag is passed, then duplicate transcripts that appear in the input will be retained and quantified separately. + - name: --keep_fixed_fasta + type: boolean_true + description: | + Retain the fixed fasta file (without short transcripts and duplicates, clipped, etc.) generated during indexing. 
+ - name: --filter_size + alternatives: ["-f"] + type: integer + description: | + The size of the Bloom filter that will be used by TwoPaCo during indexing. The filter will be of size 2^{filter_size}. The default value of -1 means that the filter size will be automatically set based on the number of distinct k-mers in the input, as estimated by nthll. + required: false + example: -1 + - name: --sparse + type: boolean_true + description: | + Build the index using a sparse sampling of k-mer positions. This will require less memory (especially during quantification), but will take longer to construct and can slow down mapping / alignment. + - name: --decoys + alternatives: ["-d"] + type: file + description: | + Treat these sequence IDs from the reference as the decoys that may have sequence homologous to some known transcript. For example, in the case of the genome, provide a list of chromosome names (one per line). + required: false + example: decoys.txt + - name: --no_clip + type: boolean_true + description: | + Don't clip poly-A tails from the ends of target sequences. + - name: --type + alternatives: ["-n"] + type: string + description: | + The type of index to build; the only option is "puff" in this version of salmon. 
+ required: false + example: puff + + - name: Output + arguments: + - name: --index + alternatives: ["-i"] + type: file + direction: output + description: | + Salmon index + required: true + example: Salmon_index + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: quay.io/biocontainers/salmon:1.10.2--hecfa306_0 + setup: + - type: docker + run: | + salmon index -v 2>&1 | sed 's/salmon \([0-9.]*\)/salmon: \1/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/salmon/salmon_index/help.txt b/src/salmon/salmon_index/help.txt new file mode 100644 index 00000000..bcca44d0 --- /dev/null +++ b/src/salmon/salmon_index/help.txt @@ -0,0 +1,66 @@ +```bash +salmon index -h +``` + +Version Info: This is the most recent version of salmon. + +Index +========== +Creates a salmon index. + +Command Line Options: + -v [ --version ] print version string + -h [ --help ] produce help message + -t [ --transcripts ] arg Transcript fasta file. + -k [ --kmerLen ] arg (=31) The size of k-mers that should be used for the + quasi index. + -i [ --index ] arg salmon index. + --gencode This flag will expect the input transcript + fasta to be in GENCODE format, and will split + the transcript name at the first '|' character. + These reduced names will be used in the output + and when looking for these transcripts in a + gene to transcript GTF. + --features This flag will expect the input reference to be + in the tsv file format, and will split the + feature name at the first 'tab' character. + These reduced names will be used in the output + and when looking for the sequence of the + features.GTF. + --keepDuplicates This flag will disable the default indexing + behavior of discarding sequence-identical + duplicate transcripts. 
If this flag is passed, + then duplicate transcripts that appear in the + input will be retained and quantified + separately. + -p [ --threads ] arg (=2) Number of threads to use during indexing. + --keepFixedFasta Retain the fixed fasta file (without short + transcripts and duplicates, clipped, etc.) + generated during indexing + -f [ --filterSize ] arg (=-1) The size of the Bloom filter that will be used + by TwoPaCo during indexing. The filter will be + of size 2^{filterSize}. The default value of -1 + means that the filter size will be + automatically set based on the number of + distinct k-mers in the input, as estimated by + nthll. + --tmpdir arg The directory location that will be used for + TwoPaCo temporary files; it will be created if + need be and be removed prior to indexing + completion. The default value will cause a + (temporary) subdirectory of the salmon index + directory to be used for this purpose. + --sparse Build the index using a sparse sampling of + k-mer positions This will require less memory + (especially during quantification), but will + take longer to construct and can slow down + mapping / alignment + -d [ --decoys ] arg Treat these sequences ids from the reference as + the decoys that may have sequence homologous to + some known transcript. for example in case of + the genome, provide a list of chromosome name + --- one per line + -n [ --no-clip ] Don't clip poly-A tails from the ends of target + sequences + --type arg (=puff) The type of index to build; the only option is + "puff" in this version of salmon. 
diff --git a/src/salmon/salmon_index/script.sh b/src/salmon/salmon_index/script.sh new file mode 100644 index 00000000..c2b9e7a0 --- /dev/null +++ b/src/salmon/salmon_index/script.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +set -e + +## VIASH START +## VIASH END + +[[ "$par_gencode" == "false" ]] && unset par_gencode +[[ "$par_features" == "false" ]] && unset par_features +[[ "$par_keep_duplicates" == "false" ]] && unset par_keep_duplicates +[[ "$par_keep_fixed_fasta" == "false" ]] && unset par_keep_fixed_fasta +[[ "$par_sparse" == "false" ]] && unset par_sparse +[[ "$par_no_clip" == "false" ]] && unset par_no_clip + +tmp_dir=$(mktemp -d -p "$meta_temp_dir" "${meta_functionality_name}_XXXXXX") +mkdir -p "$tmp_dir/temp" + +if [[ -f "$par_genome" ]] && [[ ! "$par_decoys" ]]; then + filename="$(basename -- $par_genome)" + decoys="decoys.txt" + if [ ${filename##*.} == "gz" ]; then + grep '^>' <(gunzip -c $par_genome) | cut -d ' ' -f 1 > $decoys + gentrome="gentrome.fa.gz" + else + grep '^>' $par_genome | cut -d ' ' -f 1 > $decoys + gentrome="gentrome.fa" + fi + sed -i.bak -e 's/>//g' $decoys + cat $par_transcripts $par_genome > $gentrome +else + gentrome=$par_transcripts + decoys=$par_decoys +fi + +salmon index \ + -t "$gentrome" \ + --tmpdir "$tmp_dir/temp" \ + ${meta_cpus:+--threads "${meta_cpus}"} \ + -i "$par_index" \ + ${par_kmer_len:+-k "${par_kmer_len}"} \ + ${par_gencode:+--gencode} \ + ${par_features:+--features} \ + ${par_keep_duplicates:+--keepDuplicates} \ + ${par_keep_fixed_fasta:+--keepFixedFasta} \ + ${par_filter_size:+-f "${par_filter_size}"} \ + ${par_sparse:+--sparse} \ + ${decoys:+-d "${decoys}"} \ + ${par_no_clip:+--no-clip} \ + ${par_type:+--type "${par_type}"} \ No newline at end of file diff --git a/src/salmon/salmon_index/test.sh b/src/salmon/salmon_index/test.sh new file mode 100644 index 00000000..091f11a9 --- /dev/null +++ b/src/salmon/salmon_index/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +echo "> Prepare test data" + +dir_in="test_data" 
+mkdir -p "$dir_in" + +cat > "$dir_in/transcriptome.fasta" <<'EOF' +>contig1 +AGCTCCAGATTCGCTCAGGCCCTTGATCATCAGTCGTCGTCGTCTTCGATTTGCCAGAGG +AGTTTAGATGAAGAATGTCAAGGATGTTCCTCCCTGCCCTCCCATCTAGCCAAGAACATT +TCCAAGAAGATAAAACTGTCACTGAGACAGGTCTGGATGCGCCCTAGGGGCAAATAGAGA +>contig2 +AGGCCTTTACCACATTGCTGCTGGCTATAGGAAGTCCCAGGTACTAGCCTGAAACAGCTG +ATATTTGGGGCTGTCACAGACAATATGGCCACCCCTTGGTCTTTATGCATGAAGATTATG +TAAAGGTTTTTATTAAAAAATATATATATATATATAAATGATCTAGATTATTTTCCTCTT +TCTGAAGTACTTTCTTAAAAAAATAAAATTAAATGTTTATAGTATTCCCGGT +EOF + +printf ">>> Run salmon_index" +"$meta_executable" \ + --transcripts $dir_in/transcriptome.fasta \ + --index index \ + --kmer_len 31 + +printf ">>> Checking whether output exists" +[ ! -d "index" ] && echo "'index' does not exist!" && exit 1 +[ -z "$(ls -A 'index')" ] && echo "'index' is empty!" && exit 1 +[ ! -f "index/info.json" ] && echo "Salmon index does not contain 'info.json'! Not all files were generated correctly!" && exit 1 +[ $(grep '"k": [0-9]*' index/info.json | cut -d':' -f 2) != '31,' ] && printf "The generated Salmon index seems to be incorrect!" && exit 1 + +echo "All tests succeeded!" +exit 0 \ No newline at end of file diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml new file mode 100644 index 00000000..47d72665 --- /dev/null +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -0,0 +1,589 @@ +name: salmon_quant +namespace: salmon +description: | + Salmon is a tool for wicked-fast transcript quantification from RNA-seq data. It can either make use of pre-computed alignments (in the form of a SAM/BAM file) to the transcripts rather than the raw reads, or can be run in the mapping-based mode. 
+keywords: ["Transcriptome", "Quantification"] +links: + homepage: https://salmon.readthedocs.io/en/latest/salmon.html + documentation: https://salmon.readthedocs.io/en/latest/salmon.html + repository: https://github.com/COMBINE-lab/salmon +references: + doi: "10.1038/nmeth.4197" +license: GPL-3.0 +requirements: + commands: [ salmon ] + +argument_groups: + - name: Common input options + arguments: + - name: --lib_type + alternatives: ["-l"] + type: string + description: | + Format string describing the library ('A' lets Salmon infer the library type automatically). + The library type string consists of three parts: + 1. Relative orientation of the reads: This part is only provided if the library is paired-end. The possible options are: + I = inward + O = outward + M = matching + 2. Strandedness of the library: This part specifies whether the protocol is stranded or unstranded. The options are: + S = stranded + U = unstranded + 3. Directionality of the reads: If the library is stranded, the final part of the library string is used to specify the strand from which the read originates. The possible values are: + F = read 1 (or single-end read) comes from the forward strand + R = read 1 (or single-end read) comes from the reverse strand + required: false + default: 'A' + choices: ['A', 'U', 'SF', 'SR', 'IU', 'IS', 'ISF', 'ISR', 'OU', 'OS', 'OSF', 'OSR', 'MU', 'MS', 'MSF', 'MSR'] + - name: Mapping input options + arguments: + - name: --index + alternatives: ["-i"] + type: file + description: | + Salmon index. + required: true + example: transcriptome_index + - name: --unmated_reads + alternatives: ["-r"] + type: file + description: | + List of files containing unmated reads (e.g. single-end reads). + required: false + multiple: true + example: sample.fq.gz + - name: --mates1 + alternatives: ["-m1"] + type: file + description: | + File containing the #1 mates. + required: false + multiple: true + example: sample_1.fq.gz + - name: --mates2 + alternatives: ["-m2"] + type: file + description: | + File containing the #2 mates. 
+ required: false + multiple: true + example: sample_2.fq.gz + + - name: Alignment input options + arguments: + - name: --discard_orphans + type: boolean_true + description: | + Discard orphan alignments in the input [for alignment-based mode only]. If this flag is passed, then only paired alignments will be considered toward quantification estimates. The default behavior is to consider orphan alignments if no valid paired mappings exist. + - name: --alignments + alternatives: ["-a"] + type: file + description: | + Input alignment (BAM) file(s). + required: false + multiple: true + example: sample.bam + - name: --eqclasses + alternatives: ["-e"] + type: file + description: | + Input salmon weighted equivalence class file. + required: false + - name: --targets + alternatives: ["-t"] + type: file + description: | + FASTA format file containing target transcripts. + required: false + example: transcripts.fasta + - name: --ont + type: boolean_true + description: | + Use alignment model for Oxford Nanopore long reads. + + - name: Output + arguments: + - name: --output + alternatives: ["-o"] + type: file + direction: output + description: | + Output quantification directory. + required: true + example: quant_output + - name: --quant_results + type: file + direction: output + description: | + Salmon quantification file. + required: true + example: quant.sf + + - name: Basic options + arguments: + - name: --seq_bias + type: boolean_true + description: | + Perform sequence-specific bias correction. + - name: --gc_bias + type: boolean_true + description: | + Perform fragment GC bias correction [beta for single-end reads]. + - name: --pos_bias + type: boolean_true + description: | + Perform positional bias correction. + - name: --incompat_prior + type: double + description: | + Set the prior probability that an alignment that disagrees with the specified library type (--lib_type) results from the true fragment origin. 
Setting this to 0 specifies that alignments that disagree with the library type should be "impossible", while setting it to 1 says that alignments that disagree with the library type are no less likely than those that do. + required: false + min: 0 + max: 1 + example: 0 + - name: --gene_map + alternatives: ["-g"] + type: file + description: | + File containing a mapping of transcripts to genes. If this file is provided salmon will output both quant.sf and quant.genes.sf files, where the latter contains aggregated gene-level abundance estimates. The transcript to gene mapping should be provided as either a GTF file, or in a simple tab-delimited format where each line contains the name of a transcript and the gene to which it belongs separated by a tab. The extension of the file is used to determine how the file should be parsed. Files ending in '.gtf', '.gff' or '.gff3' are assumed to be in GTF format; files with any other extension are assumed to be in the simple format. In GTF / GFF format, the "transcript_id" is assumed to contain the transcript identifier and the "gene_id" is assumed to contain the corresponding gene identifier. + required: false + example: gene_map.gtf + - name: --aux_target_file + type: file + description: | + A file containing a list of "auxiliary" targets. These are valid targets (i.e., not decoys) to which fragments are allowed to map and be assigned, and which will be quantified, but for which auxiliary models like sequence-specific and fragment-GC bias correction should not be applied. + required: false + example: auxiliary_targets.txt + - name: --meta + type: boolean_true + description: | + If you're using Salmon on a metagenomic dataset, consider setting this flag to disable parts of the abundance estimation model that make less sense for metagenomic data. + - name: --score_exp + type: double + description: | + The factor by which sub-optimal alignment scores are downweighted to produce a probability. 
If the best alignment score for the current read is S, and the score for a particular alignment is w, then the probability will be computed proportional to exp( - score_exp * (S-w) ). + required: false + example: 1 + + - name: Options specific to mapping mode + arguments: + - name: --discard_orphans_quasi + type: boolean_true + description: | + [selective-alignment mode only] + Discard orphan mappings in selective-alignment mode. If this flag is passed then only paired mappings will be considered toward quantification estimates. The default behavior is to consider orphan mappings if no valid paired mappings exist. This flag is independent of the option to write the orphaned mappings to file (--writeOrphanLinks). + - name: --consensus_slack + type: double + description: | + [selective-alignment mode only] + The amount of slack allowed in the selective-alignment filtering mechanism. If this is set to a fraction, X, greater than 0 (and in [0,1)), then uniMEM chains with scores below (100 * X)% of the best chain score for a read, and read pairs with a sum of chain scores below (100 * X)% of the best chain score for a read pair will be discounted as mapping candidates. The default value of this option is 0.35. + required: false + min: 0 + max: 0.999999999 + example: 0.35 + - name: --pre_merge_chain_sub_thresh + type: double + description: | + [selective-alignment mode only] + The threshold of sub-optimal chains, compared to the best chain on a given target, that will be retained and passed to the next phase of mapping. Specifically, if the best chain for a read (or read-end in paired-end mode) to target t has score X_t, then all chains for this read with score >= X_t * pre_merge_chain_sub_thresh will be retained and passed to subsequent mapping phases. This value must be in the range [0, 1]. 
+ required: false + min: 0 + max: 1 + example: 0.75 + - name: --post_merge_chain_sub_thresh + type: double + description: | + [selective-alignment mode only] + The threshold of sub-optimal chains, compared to the best chain on a given target, that will be retained and passed to the next phase of mapping. This is different from pre_merge_chain_sub_thresh, because this is applied to pairs of chains (from the ends of paired-end reads) after merging (i.e. after checking concordancy constraints etc.). Specifically, if the best chain pair to target t has score X_t, then all chain pairs for this read pair with score >= X_t * post_merge_chain_sub_thresh will be retained and passed to subsequent mapping phases. This value must be in the range [0, 1]. Note: This option is only meaningful for paired-end libraries, and is ignored for single-end libraries. + required: false + min: 0 + max: 1 + example: 0.9 + - name: --orphan_chain_sub_thresh + type: double + description: | + [selective-alignment mode only] + This threshold sets a global sub-optimality threshold for chains corresponding to orphan mappings. That is, if the merging procedure results in no concordant mappings then only orphan mappings with a chain score >= orphan_chain_sub_thresh * bestChainScore will be retained and passed to subsequent mapping phases. This value must be in the range [0, 1]. Note: This option is only meaningful for paired-end libraries, and is ignored for single-end libraries. + required: false + min: 0 + max: 1 + example: 0.95 + - name: --min_score_fraction + type: double + description: | + [selective-alignment mode only] + The fraction of the optimal possible alignment score that a mapping must achieve in order to be considered "valid" --- should be in (0,1]. 
Default 0.65 + required: false + min: 0.000000001 + max: 1 + example: 0.65 + - name: --mismatch_seed_skip + type: integer + description: | + [selective-alignment mode only] + After a k-mer hit is extended to a uni-MEM, the uni-MEM extension can terminate for one of 3 reasons: the end of the read, the end of the unitig, or a mismatch. If the extension ends because of a mismatch, this is likely the result of a sequencing error. To avoid looking up many k-mers that will likely fail to be located in the index, the search procedure skips by a factor of mismatch_seed_skip until it either (1) finds another match or (2) is k-bases past the mismatch position. This value controls that skip length. A smaller value can increase sensitivity, while a larger value can speed up seeding. + required: false + example: 3 + - name: --disable_chaining_heuristic + type: boolean_true + description: | + [selective-alignment mode only] + By default, the heuristic of (Li 2018) is implemented, which terminates the chaining DP once a given number of valid backpointers are found. This speeds up the seed (MEM) chaining step, but may result in sub-optimal chains in complex situations (e.g. sequences with many repeats and overlapping repeats). Passing this flag will disable the chaining heuristic, and perform the full chaining dynamic program, guaranteeing the optimal chain is found in this step. + - name: --decoy_threshold + type: double + description: | + [selective-alignment mode only] + For an alignment to an annotated transcript to be considered invalid, it must have an alignment score < (decoy_threshold * bestDecoyScore). A value of 1.0 means that any alignment strictly worse than the best decoy alignment will be discarded. A smaller value will allow reads to be allocated to transcripts even if they strictly align better to the decoy sequence. 
+ required: false + min: 0 + max: 1 + example: 1 + - name: --ma + type: integer + description: | + [selective-alignment mode only] + The value given to a match between read and reference nucleotides in an alignment. + required: false + example: 2 + - name: --mp + type: integer + description: | + [selective-alignment mode only] + The value given to a mis-match between read and reference nucleotides in an alignment. + required: false + example: -4 + - name: --go + type: integer + description: | + [selective-alignment mode only] + The value given to a gap opening in an alignment. + required: false + example: 6 + - name: --ge + type: integer + description: | + [selective-alignment mode only] + The value given to a gap extension in an alignment. + required: false + example: 2 + - name: --bandwidth + type: integer + description: | + [selective-alignment mode only] + The value used for the bandwidth passed to ksw2. A smaller bandwidth can make the alignment verification run more quickly, but could possibly miss valid alignments. + required: false + example: 15 + - name: --allow_dovetail + type: boolean_true + description: | + [selective-alignment mode only] + Allow dovetailing mappings. + - name: --recover_orphans + type: boolean_true + description: | + [selective-alignment mode only] + Attempt to recover the mates of orphaned reads. This uses edlib for orphan recovery, and so introduces some computational overhead, but it can improve sensitivity. + - name: --mimicBT2 + type: boolean_true + description: | + [selective-alignment mode only] + Set flags to mimic parameters similar to Bowtie2 with --no-discordant and --no-mixed flags. This disallows dovetailing reads and discards orphans. Note, this does not impose the very strict parameters assumed by RSEM+Bowtie2, like gapless alignments. For that behavior, use the --mimic_strictBT2 flag below. 
+ - name: --mimic_strictBT2 + type: boolean_true + description: | + [selective-alignment mode only] + Set flags to mimic the very strict parameters used by RSEM+Bowtie2. This increases --min_score_fraction to 0.8, disallows dovetailing reads, discards orphans, and disallows gaps in alignments. + - name: --softclip + type: boolean_true + description: | + [selective-alignment mode only] + Allow soft-clipping of reads during selective-alignment. If this option is provided, then regions at the beginning or end of the read can be withheld from alignment without any effect on the resulting score (i.e. neither adding nor removing from the score). This will drastically reduce the penalty if there are mismatches at the beginning or end of the read due to e.g. low-quality bases or adapters. NOTE: Even with soft-clipping enabled, the read must still achieve a score of at least min_score_fraction * maximum achievable score, where the maximum achievable score is computed based on the full (un-clipped) read length. + - name: --softclip_overhangs + type: boolean_true + description: | + [selective-alignment mode only] + Allow soft-clipping of reads that overhang the beginning or ends of the transcript. In this case, the overhanging section of the read will simply be unaligned, and will not contribute or detract from the alignment score. The default policy is to force an end-to-end alignment of the entire read, so that overhangs will result in some deletion of nucleotides from the read. + - name: --full_length_alignment + type: boolean_true + description: | + [selective-alignment mode only] + Perform selective alignment over the full length of the read, beginning from the (approximate) initial mapping location and using extension alignment. This is in contrast with the default behavior which is to only perform alignment between the MEMs in the optimal chain (and before the first and after the last MEM if applicable). 
The default strategy forces the MEMs to belong to the alignment, but has the benefit that it can discover indels prior to the first hit shared between the read and reference. Except in very rare circumstances, the default mode should be more accurate. + - name: --hard_filter + type: boolean_true + description: | + [selective-alignment mode only] + Instead of weighting mappings by their alignment score, this flag will discard any mappings with sub-optimal alignment score. The default option of soft-filtering (i.e. weighting mappings by their alignment score) usually yields slightly more accurate abundance estimates but this flag may be desirable if you want more accurate 'naive' equivalence classes, rather than range factorized equivalence classes. + - name: --min_aln_prob + type: double + description: | + [selective-alignment mode only] Any mapping whose alignment probability (as computed by P(aln) = exp(-score_exp * difference from best mapping score)) is less than min_aln_prob will not be considered as a valid alignment for this read. The goal of this flag is to remove very low probability alignments that are unlikely to have any non-trivial effect on the final quantifications. Filtering such alignments reduces the number of variables that need to be considered and can result in slightly faster inference and 'cleaner' equivalence classes. + example: 0.00001 + - name: --write_mappings + alternatives: ["-z"] + type: file + direction: output + description: | + If this option is provided, then the selective-alignment results will be written out in SAM-compatible format. By default, output will be directed to stdout, but an alternative file name can be provided instead. + required: false + example: mappings.sam + - name: --write_qualities + type: boolean_true + description: | + This flag only has meaning if mappings are being written (with --write_mappings/-z). If this flag is provided, then the output SAM file will contain quality strings as well as read sequences. Note that this can greatly increase the size of the output file. + - name: --hit_filter_policy + type: string + description: | + [selective-alignment mode only] + Determines the policy by which hits are filtered in selective alignment. Filtering hits after chaining (the default) is more sensitive, but more computationally intensive, because it performs the chaining dynamic program for all hits. 
Filtering before chaining is faster, but some true hits may be missed. The options are BEFORE, AFTER, BOTH and NONE. + required: false + choices: [BEFORE, AFTER, BOTH, NONE] + example: AFTER + + - name: Advance options + arguments: + - name: --alternative_init_mode + type: boolean_true + description: | + Use an alternative strategy (rather than simple interpolation between) the online and uniform abundance estimates to initialize the EM / VBEM algorithm. + - name: --aux_dir + type: file + direction: output + description: | + The sub-directory of the quantification directory where auxiliary information e.g. bootstraps, bias parameters, etc. will be written. + required: false + example: aux_info + - name: --skip_quant + type: boolean_true + description: | + Skip performing the actual transcript quantification (including any Gibbs sampling or bootstrapping). + - name: --dump_eq + type: boolean_true + description: | + Dump the simple equivalence class counts that were computed during mapping or alignment. + - name: --dump_eq_weights + alternatives: ["-d"] + type: boolean_true + description: | + Dump conditional probabilities associated with transcripts when equivalence class information is being dumped to file. Note, this will dump the factorization that is actually used by salmon's offline phase for inference. If you are using range-factorized equivalence classes (the default) then the same transcript set may appear multiple times with different associated conditional probabilities. + - name: --min_assigned_frags + type: integer + description: | + The minimum number of fragments that must be assigned to the transcriptome for quantification to proceed. + required: false + example: 10 + - name: --reduce_GC_memory + type: boolean_true + description: | + If this option is selected, a more memory efficient (but slightly slower) representation is used to compute fragment GC content. Enabling this will reduce memory usage, but can also reduce speed. 
However, the results themselves will remain the same. + - name: --bias_speed_samp + type: integer + description: | + The value at which the fragment length PMF is down-sampled when evaluating sequence-specific & GC fragment bias. Larger values speed up effective length correction, but may decrease the fidelity of bias modeling results. + required: false + example: 5 + - name: --fld_max + type: integer + description: | + The maximum fragment length to consider when building the empirical distribution + required: false + example: 1000 + - name: --fld_mean + type: integer + description: | + The mean used in the fragment length distribution prior + required: false + example: 250 + - name: --fld_SD + type: integer + description: | + The standard deviation used in the fragment length distribution prior + required: false + example: 25 + - name: --forgetting_factor + alternatives: ["-f"] + type: double + description: | + The forgetting factor used in the online learning schedule. A smaller value results in quicker learning, but higher variance and may be unstable. A larger value results in slower learning but may be more stable. Value should be in the interval (0.5, 1.0]. + required: false + min: 0.500000001 + max: 1 + example: 0.65 + - name: --init_uniform + type: boolean_true + description: | + Initialize the offline inference with uniform parameters, rather than seeding with online parameters. + - name: --max_occs_per_hit + type: integer + description: | + When collecting "hits" (MEMs), hits having more than max_occs_per_hit occurrences won't be considered. + required: false + example: 1000 + - name: --max_read_occ + type: integer + description: | + Reads "mapping" to more than this many places won't be considered. + required: false + example: 200 + - name: --no_length_correction + type: boolean_true + description: | + Entirely disables length correction when estimating the abundance of transcripts.
This option can be used with protocols where one expects that fragments derive from their underlying targets without regard to that target's length (e.g. QuantSeq) + - name: --no_effective_length_correction + type: boolean_true + description: | + Disables effective length correction when computing the probability that a fragment was generated from a transcript. If this flag is passed in, the fragment length distribution is not taken into account when computing this probability. + - name: --no_single_frag_prob + type: boolean_true + description: | + Disables the estimation of an associated fragment length probability for single-end reads or for orphaned mappings in paired-end libraries. The default behavior is to consider the probability of all possible fragment lengths associated with the retained mapping. Enabling this flag (i.e. turning this default behavior off) will simply not attempt to estimate a fragment length probability in such cases. + - name: --no_frag_length_dist + type: boolean_true + description: | + Don't consider concordance with the learned fragment length distribution when trying to determine the probability that a fragment has originated from a specified location. Normally, Fragments with unlikely lengths will be assigned a smaller relative probability than those with more likely lengths. When this flag is passed in, the observed fragment length has no effect on that fragment's a priori probability. + - name: --no_bias_length_threshold + type: boolean_true + description: | + If this option is enabled, then no (lower) threshold will be set on how short bias correction can make effective lengths. This can increase the precision of bias correction, but harm robustness. The default correction applies a threshold. + - name: --num_bias_samples + type: integer + description: | + Number of fragment mappings to use when learning the sequence-specific bias model.
+ required: false + example: 2000000 + - name: --num_aux_model_samples + type: integer + description: | + The first <num_aux_model_samples> observations are used to train the auxiliary model parameters (e.g. fragment length distribution, bias, etc.). After the first <num_aux_model_samples> observations the auxiliary model parameters will be assumed to have converged and will be fixed. + required: false + example: 5000000 + - name: --num_pre_aux_model_samples + type: integer + description: | + The first <num_pre_aux_model_samples> observations will have their assignment likelihoods and contributions to the transcript abundances computed without applying any auxiliary models. The purpose of ignoring the auxiliary models for the first <num_pre_aux_model_samples> observations is to avoid applying these models before their parameters have been learned sufficiently well. + required: false + example: 5000 + - name: --useEM + type: boolean_true + description: | + Use the traditional EM algorithm for optimization in the batch passes. + - name: --useVBOpt + type: boolean_true + description: | + Use the Variational Bayesian EM [default] + - name: --range_factorization_bins + type: integer + description: | + Factorizes the likelihood used in quantification by adopting a new notion of equivalence classes based on the conditional probabilities with which fragments are generated from different transcripts. This is a more fine-grained factorization than the normal rich equivalence classes. The default value (4) corresponds to the default used in Zakeri et al. 2017 (doi: 10.1093/bioinformatics/btx262), and larger values imply a more fine-grained factorization. If range factorization is enabled, a common value to select for this parameter is 4. A value of 0 signifies the use of basic rich equivalence classes. + required: false + example: 4 + - name: --num_Gibbs_samples + type: integer + description: | + Number of Gibbs sampling rounds to perform.
+ required: false + example: 0 + - name: --no_Gamma_draw + type: boolean_true + description: | + This switch will disable drawing transcript fractions from a Gamma distribution during Gibbs sampling. In this case the sampler does not account for shot-noise, but only assignment ambiguity + - name: --num_bootstraps + type: integer + description: | + Number of bootstrap samples to generate. Note: This is mutually exclusive with Gibbs sampling. + required: false + example: 0 + - name: --bootstrap_reproject + type: boolean_true + description: | + This switch will learn the parameter distribution from the bootstrapped counts for each sample, but will reproject those parameters onto the original equivalence class counts. + - name: --thinning_factor + type: integer + description: | + Number of steps to discard for every sample kept from the Gibbs chain. The larger this number, the less chance that subsequent samples are auto-correlated, but the slower sampling becomes. + required: false + example: 16 + - name: --quiet + alternatives: ["-q"] + type: boolean_true + description: | + Be quiet while doing quantification (don't write informative output to the console unless something goes wrong). + - name: --per_transcript_prior + type: boolean_true + description: | + The prior (either the default or the argument provided via --vb_prior) will be interpreted as a transcript-level prior (i.e. each transcript will be given a prior read count of this value) + - name: --per_nucleotide_prior + type: boolean_true + description: | + The prior (either the default or the argument provided via --vb_prior) will be interpreted as a nucleotide-level prior (i.e. 
each nucleotide will be given a prior read count of this value) + - name: --sig_digits + type: integer + description: | + The number of significant digits to write when outputting the EffectiveLength and NumReads columns + required: false + example: 3 + - name: --vb_prior + type: double + description: | + The prior that will be used in the VBEM algorithm. This is interpreted as a per-transcript prior, unless the --per_nucleotide_prior flag is also given. If the --per_nucleotide_prior flag is given, this is used as a nucleotide-level prior. If the default is used, it will be divided by 1000 before being used as a nucleotide-level prior, i.e. the default per-nucleotide prior will be 1e-5. + required: false + example: 0.01 + - name: --write_orphan_links + type: boolean_true + description: | + Write the transcripts that are linked by orphaned reads. + - name: --write_unmapped_names + type: boolean_true + description: | + Write the names of un-mapped reads to the file unmapped_names.txt in the auxiliary directory. + + - name: Alignment-specific options + arguments: + - name: --no_error_model + type: boolean_true + description: | + Turn off the alignment error model, which takes into account the observed frequency of different types of mismatches / indels when computing the likelihood of a given alignment. Turning this off can speed up alignment-based salmon, but can harm quantification accuracy. + - name: --num_error_bins + type: integer + description: | + The number of bins into which to divide each read when learning and applying the error model. For example, a value of 10 would mean that effectively, a separate error model is learned and applied to each 10th of the read, while a value of 3 would mean that a separate error model is applied to the read beginning (first third), middle (second third) and end (final third).
+ required: false + example: 6 + - name: --sample_out + alternatives: ["-s"] + type: boolean_true + description: | + Write a "postSample.bam" file in the output directory that will sample the input alignments according to the estimated transcript abundances. If you're going to perform downstream analysis of the alignments with tools which don't, themselves, take fragment assignment ambiguity into account, you should use this output. + - name: --sample_unaligned + alternatives: ["-u"] + type: boolean_true + description: | + In addition to sampling the aligned reads, also write the un-aligned reads to "postSample.bam". + - name: --gencode + type: boolean_true + description: | + This flag will expect the input transcript fasta to be in GENCODE format, and will split the transcript name at the first '|' character. These reduced names will be used in the output and when looking for these transcripts in a gene to transcript GTF. + - name: --mapping_cache_memory_limit + type: integer + description: | + If the file contained fewer than this many mapped reads, then just keep the data in memory for subsequent rounds of inference. Obviously, this value should not be too large if you wish to keep a low memory usage, but setting it large enough to accommodate all of the mapped read can substantially speed up inference on "small" files that contain only a few million reads. 
+ required: false + example: 2000000 + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: quay.io/biocontainers/salmon:1.10.2--hecfa306_0 + setup: + - type: docker + run: | + salmon index -v 2>&1 | sed 's/salmon \([0-9.]*\)/salmon: \1/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/salmon/salmon_quant/help.txt b/src/salmon/salmon_quant/help.txt new file mode 100644 index 00000000..bcd92656 --- /dev/null +++ b/src/salmon/salmon_quant/help.txt @@ -0,0 +1,976 @@ +```bash +salmon quant -h +``` +salmon v1.10.2 +=============== + +salmon quant has two modes --- one quantifies expression using raw reads +and the other makes use of already-aligned reads (in BAM/SAM format). +Which algorithm is used depends on the arguments passed to salmon quant. +If you provide salmon with alignments '-a [ --alignments ]' then the +alignment-based algorithm will be used, otherwise the algorithm for +quantifying from raw reads will be used. + +to view the help for salmon's selective-alignment-based mode, use the command + +salmon quant --help-reads + +To view the help for salmon's alignment-based mode, use the command + +salmon quant --help-alignment + + +```bash +salmon quant --help-reads +``` +Quant +========== +Perform dual-phase, selective-alignment-based estimation of +transcript abundance from RNA-seq reads + +salmon quant options: + + +mapping input options: + -l [ --libType ] arg Format string describing the library + type + -i [ --index ] arg salmon index + -r [ --unmatedReads ] arg List of files containing unmated reads + of (e.g. single-end reads) + -1 [ --mates1 ] arg File containing the #1 mates + -2 [ --mates2 ] arg File containing the #2 mates + + +basic options: + -v [ --version ] print version string + -h [ --help ] produce help message + -o [ --output ] arg Output quantification directory. 
+ --seqBias Perform sequence-specific bias + correction. + --gcBias [beta for single-end reads] Perform + fragment GC bias correction. + --posBias Perform positional bias correction. + -p [ --threads ] arg (=16) The number of threads to use + concurrently. + --incompatPrior arg (=0) This option sets the prior probability + that an alignment that disagrees with + the specified library type (--libType) + results from the true fragment origin. + Setting this to 0 specifies that + alignments that disagree with the + library type should be "impossible", + while setting it to 1 says that + alignments that disagree with the + library type are no less likely than + those that do + -g [ --geneMap ] arg File containing a mapping of + transcripts to genes. If this file is + provided salmon will output both + quant.sf and quant.genes.sf files, + where the latter contains aggregated + gene-level abundance estimates. The + transcript to gene mapping should be + provided as either a GTF file, or a in + a simple tab-delimited format where + each line contains the name of a + transcript and the gene to which it + belongs separated by a tab. The + extension of the file is used to + determine how the file should be + parsed. Files ending in '.gtf', '.gff' + or '.gff3' are assumed to be in GTF + format; files with any other extension + are assumed to be in the simple format. + In GTF / GFF format, the + "transcript_id" is assumed to contain + the transcript identifier and the + "gene_id" is assumed to contain the + corresponding gene identifier. + --auxTargetFile arg A file containing a list of "auxiliary" + targets. These are valid targets + (i.e., not decoys) to which fragments + are allowed to map and be assigned, and + which will be quantified, but for which + auxiliary models like sequence-specific + and fragment-GC bias correction should + not be applied. 
+ --meta If you're using Salmon on a metagenomic + dataset, consider setting this flag to + disable parts of the abundance + estimation model that make less sense + for metagenomic data. + + +options specific to mapping mode: + --discardOrphansQuasi [selective-alignment mode only] : + Discard orphan mappings in + selective-alignment mode. If this flag + is passed then only paired mappings + will be considered toward + quantification estimates. The default + behavior is to consider orphan mappings + if no valid paired mappings exist. + This flag is independent of the option + to write the orphaned mappings to file + (--writeOrphanLinks). + --validateMappings [*deprecated* (no effect; + selective-alignment is the default)] + --consensusSlack arg (=0.349999994) [selective-alignment mode only] : The + amount of slack allowed in the + selective-alignment filtering + mechanism. If this is set to a + fraction, X, greater than 0 (and in + [0,1)), then uniMEM chains with scores + below (100 * X)% of the best chain + score for a read, and read pairs with a + sum of chain scores below (100 * X)% of + the best chain score for a read pair + will be discounted as a mapping + candidates. The default value of this + option is 0.35. + --preMergeChainSubThresh arg (=0.75) [selective-alignment mode only] : The + threshold of sub-optimal chains, + compared to the best chain on a given + target, that will be retained and + passed to the next phase of mapping. + Specifically, if the best chain for a + read (or read-end in paired-end mode) + to target t has score X_t, then all + chains for this read with score >= X_t + * preMergeChainSubThresh will be + retained and passed to subsequent + mapping phases. This value must be in + the range [0, 1]. 
+ --postMergeChainSubThresh arg (=0.90000000000000002) + [selective-alignment mode only] : The + threshold of sub-optimal chain pairs, + compared to the best chain pair on a + given target, that will be retained and + passed to the next phase of mapping. + This is different than + preMergeChainSubThresh, because this is + applied to pairs of chains (from the + ends of paired-end reads) after merging + (i.e. after checking concordancy + constraints etc.). Specifically, if + the best chain pair to target t has + score X_t, then all chain pairs for + this read pair with score >= X_t * + postMergeChainSubThresh will be + retained and passed to subsequent + mapping phases. This value must be in + the range [0, 1]. Note: This option is + only meaningful for paired-end + libraries, and is ignored for + single-end libraries. + --orphanChainSubThresh arg (=0.94999999999999996) + [selective-alignment mode only] : This + threshold sets a global sub-optimality + threshold for chains corresponding to + orphan mappings. That is, if the + merging procedure results in no + concordant mappings then only orphan + mappings with a chain score >= + orphanChainSubThresh * bestChainScore + will be retained and passed to + subsequent mapping phases. This value + must be in the range [0, 1]. Note: This + option is only meaningful for + paired-end libraries, and is ignored + for single-end libraries. + --scoreExp arg (=1) [selective-alignment mode only] : The + factor by which sub-optimal alignment + scores are downweighted to produce a + probability. If the best alignment + score for the current read is S, and + the score for a particular alignment is + w, then the probability will be + computed porportional to exp( - + scoreExp * (S-w) ). + --minScoreFraction arg [selective-alignment mode only] : The + fraction of the optimal possible + alignment score that a mapping must + achieve in order to be considered + "valid" --- should be in (0,1]. 
+ Salmon Default 0.65 and Alevin Default + 0.87 + --mismatchSeedSkip arg (=3) [selective-alignment mode only] : After + a k-mer hit is extended to a uni-MEM, + the uni-MEM extension can terminate for + one of 3 reasons; the end of the read, + the end of the unitig, or a mismatch. + If the extension ends because of a + mismatch, this is likely the result of + a sequencing error. To avoid looking + up many k-mers that will likely fail to + be located in the index, the search + procedure skips by a factor of + mismatchSeedSkip until it either (1) + finds another match or (2) is k-bases + past the mismatch position. This value + controls that skip length. A smaller + value can increase sensitivity, while a + larger value can speed up seeding. + --disableChainingHeuristic [selective-alignment mode only] : By + default, the heuristic of (Li 2018) is + implemented, which terminates the + chaining DP once a given number of + valid backpointers are found. This + speeds up the seed (MEM) chaining step, + but may result in sub-optimal chains in + complex situations (e.g. sequences with + many repeats and overlapping repeats). + Passing this flag will disable the + chaining heuristic, and perform the + full chaining dynamic program, + guaranteeing the optimal chain is found + in this step. + --decoyThreshold arg (=1) [selective-alignment mode only] : For + an alignemnt to an annotated transcript + to be considered invalid, it must have + an alignment score < (decoyThreshold * + bestDecoyScore). A value of 1.0 means + that any alignment strictly worse than + the best decoy alignment will be + discarded. A smaller value will allow + reads to be allocated to transcripts + even if they strictly align better to + the decoy sequence. + --ma arg (=2) [selective-alignment mode only] : The + value given to a match between read and + reference nucleotides in an alignment. 
+ --mp arg (=-4) [selective-alignment mode only] : The + value given to a mis-match between read + and reference nucleotides in an + alignment. + --go arg (=6) [selective-alignment mode only] : The + value given to a gap opening in an + alignment. + --ge arg (=2) [selective-alignment mode only] : The + value given to a gap extension in an + alignment. + --bandwidth arg (=15) [selective-alignment mode only] : The + value used for the bandwidth passed to + ksw2. A smaller bandwidth can make the + alignment verification run more + quickly, but could possibly miss valid + alignments. + --allowDovetail [selective-alignment mode only] : allow + dovetailing mappings. + --recoverOrphans [selective-alignment mode only] : + Attempt to recover the mates of + orphaned reads. This uses edlib for + orphan recovery, and so introduces some + computational overhead, but it can + improve sensitivity. + --mimicBT2 [selective-alignment mode only] : Set + flags to mimic parameters similar to + Bowtie2 with --no-discordant and + --no-mixed flags. This increases + disallows dovetailing reads, and + discards orphans. Note, this does not + impose the very strict parameters + assumed by RSEM+Bowtie2, like gapless + alignments. For that behavior, use the + --mimiStrictBT2 flag below. + --mimicStrictBT2 [selective-alignment mode only] : Set + flags to mimic the very strict + parameters used by RSEM+Bowtie2. This + increases --minScoreFraction to 0.8, + disallows dovetailing reads, discards + orphans, and disallows gaps in + alignments. + --softclip [selective-alignment mode only + (experimental)] : Allos soft-clipping + of reads during selective-alignment. If + this option is provided, then regions + at the beginning or end of the read can + be withheld from alignment without any + effect on the resulting score (i.e. + neither adding nor removing from the + score). This will drastically reduce + the penalty if there are mismatches at + the beginning or end of the read due to + e.g. 
low-quality bases or adapters. + NOTE: Even with soft-clipping enabled, + the read must still achieve a score of + at least minScoreFraction * maximum + achievable score, where the maximum + achievable score is computed based on + the full (un-clipped) read length. + --softclipOverhangs [selective-alignment mode only] : Allow + soft-clipping of reads that overhang + the beginning or ends of the + transcript. In this case, the + overhaning section of the read will + simply be unaligned, and will not + contribute or detract from the + alignment score. The default policy is + to force an end-to-end alignment of the + entire read, so that overhanings will + result in some deletion of nucleotides + from the read. + --fullLengthAlignment [selective-alignment mode only] : + Perform selective alignment over the + full length of the read, beginning from + the (approximate) initial mapping + location and using extension alignment. + This is in contrast with the default + behavior which is to only perform + alignment between the MEMs in the + optimal chain (and before the first and + after the last MEM if applicable). The + default strategy forces the MEMs to + belong to the alignment, but has the + benefit that it can discover indels + prior to the first hit shared between + the read and reference. Except in very + rare circumstances, the default mode + should be more accurate. + --hardFilter [selective-alignemnt mode only] : + Instead of weighting mappings by their + alignment score, this flag will discard + any mappings with sub-optimal alignment + score. The default option of + soft-filtering (i.e. weighting mappings + by their alignment score) usually + yields slightly more accurate abundance + estimates but this flag may be + desirable if you want more accurate + 'naive' equivalence classes, rather + than range factorized equivalence + classes. 
+ --minAlnProb arg (=1.0000000000000001e-05) + [selective-alignment mode only] : Any + mapping whose alignment probability (as + computed by P(aln) = exp(-scoreExp * + difference from best mapping score) is + less than minAlnProb will not be + considered as a valid alignment for + this read. The goal of this flag is to + remove very low probability alignments + that are unlikely to have any + non-trivial effect on the final + quantifications. Filtering such + alignments reduces the number of + variables that need to be considered + and can result in slightly faster + inference and 'cleaner' equivalence + classes. + -z [ --writeMappings ] [=arg(=-)] If this option is provided, then the + selective-alignment results will be + written out in SAM-compatible format. + By default, output will be directed to + stdout, but an alternative file name + can be provided instead. + --writeQualities This flag only has meaning if mappings + are being written (with + --writeMappings/-z). If this flag is + provided, then the output SAM file will + contain quality strings as well as read + sequences. Note that this can greatly + increase the size of the output file. + --hitFilterPolicy arg (=AFTER) [selective-alignment mode only] : + Determines the policy by which hits are + filtered in selective alignment. + Filtering hits after chaining (the + default) is more sensitive, but more + computationally intensive, because it + performs the chaining dynamic program + for all hits. Filtering before + chaining is faster, but some true hits + may be missed. The options are BEFORE, + AFTER, BOTH and NONE. + + +advanced options: + --alternativeInitMode [Experimental]: Use an alternative + strategy (rather than simple + interpolation between) the online and + uniform abundance estimates to + initialize the EM / VBEM algorithm. + --auxDir arg (=aux_info) The sub-directory of the quantification + directory where auxiliary information + e.g. bootstraps, bias parameters, etc. + will be written. 
+ --skipQuant Skip performing the actual transcript + quantification (including any Gibbs + sampling or bootstrapping). + --dumpEq Dump the simple equivalence class + counts that were computed during + mapping or alignment. + -d [ --dumpEqWeights ] Dump conditional probabilities + associated with transcripts when + equivalence class information is being + dumped to file. Note, this will dump + the factorization that is actually used + by salmon's offline phase for + inference. If you are using + range-factorized equivalence classes + (the default) then the same transcript + set may appear multiple times with + different associated conditional + probabilities. + --minAssignedFrags arg (=10) The minimum number of fragments that + must be assigned to the transcriptome + for quantification to proceed. + --reduceGCMemory If this option is selected, a more + memory efficient (but slightly slower) + representation is used to compute + fragment GC content. Enabling this will + reduce memory usage, but can also + reduce speed. However, the results + themselves will remain the same. + --biasSpeedSamp arg (=5) The value at which the fragment length + PMF is down-sampled when evaluating + sequence-specific & GC fragment bias. + Larger values speed up effective length + correction, but may decrease the + fidelity of bias modeling results. + --fldMax arg (=1000) The maximum fragment length to consider + when building the empirical + distribution + --fldMean arg (=250) The mean used in the fragment length + distribution prior + --fldSD arg (=25) The standard deviation used in the + fragment length distribution prior + -f [ --forgettingFactor ] arg (=0.65000000000000002) + The forgetting factor used in the + online learning schedule. A smaller + value results in quicker learning, but + higher variance and may be unstable. A + larger value results in slower learning + but may be more stable. Value should + be in the interval (0.5, 1.0]. 
+ --initUniform initialize the offline inference with + uniform parameters, rather than seeding + with online parameters. + --maxOccsPerHit arg (=1000) When collecting "hits" (MEMs), hits + having more than maxOccsPerHit + occurrences won't be considered. + -w [ --maxReadOcc ] arg (=200) Reads "mapping" to more than this many + places won't be considered. + --maxRecoverReadOcc arg (=2500) Relevant for alevin with '--sketch' + mode only: if a read has valid seed + matches, but no read has matches + leading to fewer than "maxReadOcc" + mappings, then try to recover mappings + for this read as long as there are + fewer than "maxRecoverReadOcc" + mappings. + --noLengthCorrection [experimental] : Entirely disables + length correction when estimating the + abundance of transcripts. This option + can be used with protocols where one + expects that fragments derive from + their underlying targets without regard + to that target's length (e.g. QuantSeq) + --noEffectiveLengthCorrection Disables effective length correction + when computing the probability that a + fragment was generated from a + transcript. If this flag is passed in, + the fragment length distribution is not + taken into account when computing this + probability. + --noSingleFragProb Disables the estimation of an + associated fragment length probability + for single-end reads or for orphaned + mappings in paired-end libraries. The + default behavior is to consider the + probability of all possible fragment + lengths associated with the retained + mapping. Enabling this flag (i.e. + turning this default behavior off) will + simply not attempt to estimate a + fragment length probability in such + cases. + --noFragLengthDist [experimental] : Don't consider + concordance with the learned fragment + length distribution when trying to + determine the probability that a + fragment has originated from a + specified location. 
Normally, + Fragments with unlikely lengths will be + assigned a smaller relative probability + than those with more likely lengths. + When this flag is passed in, the + observed fragment length has no effect + on that fragment's a priori + probability. + --noBiasLengthThreshold [experimental] : If this option is + enabled, then no (lower) threshold will + be set on how short bias correction can + make effective lengths. This can + increase the precision of bias + correction, but harm robustness. The + default correction applies a threshold. + --numBiasSamples arg (=2000000) Number of fragment mappings to use when + learning the sequence-specific bias + model. + --numAuxModelSamples arg (=5000000) The first <numAuxModelSamples> are used + to train the auxiliary model parameters + (e.g. fragment length distribution, + bias, etc.). After ther first + <numAuxModelSamples> observations the + auxiliary model parameters will be + assumed to have converged and will be + fixed. + --numPreAuxModelSamples arg (=5000) The first <numPreAuxModelSamples> will + have their assignment likelihoods and + contributions to the transcript + abundances computed without applying + any auxiliary models. The purpose of + ignoring the auxiliary models for the + first <numPreAuxModelSamples> + observations is to avoid applying these + models before their parameters have + been learned sufficiently well. + --useEM Use the traditional EM algorithm for + optimization in the batch passes. + --useVBOpt Use the Variational Bayesian EM + [default] + --rangeFactorizationBins arg (=4) Factorizes the likelihood used in + quantification by adopting a new notion + of equivalence classes based on the + conditional probabilities with which + fragments are generated from different + transcripts. This is a more + fine-grained factorization than the + normal rich equivalence classes. The + default value (4) corresponds to the + default used in Zakeri et al. 2017 + (doi: 10.1093/bioinformatics/btx262), + and larger values imply a more + fine-grained factorization.
If range + factorization is enabled, a common + value to select for this parameter is + 4. A value of 0 signifies the use of + basic rich equivalence classes. + --numGibbsSamples arg (=0) Number of Gibbs sampling rounds to + perform. + --noGammaDraw This switch will disable drawing + transcript fractions from a Gamma + distribution during Gibbs sampling. In + this case the sampler does not account + for shot-noise, but only assignment + ambiguity + --numBootstraps arg (=0) Number of bootstrap samples to + generate. Note: This is mutually + exclusive with Gibbs sampling. + --bootstrapReproject This switch will learn the parameter + distribution from the bootstrapped + counts for each sample, but will + reproject those parameters onto the + original equivalence class counts. + --thinningFactor arg (=16) Number of steps to discard for every + sample kept from the Gibbs chain. The + larger this number, the less chance + that subsequent samples are + auto-correlated, but the slower + sampling becomes. + -q [ --quiet ] Be quiet while doing quantification + (don't write informative output to the + console unless something goes wrong). + --perTranscriptPrior The prior (either the default or the + argument provided via --vbPrior) will + be interpreted as a transcript-level + prior (i.e. each transcript will be + given a prior read count of this value) + --perNucleotidePrior The prior (either the default or the + argument provided via --vbPrior) will + be interpreted as a nucleotide-level + prior (i.e. each nucleotide will be + given a prior read count of this value) + --sigDigits arg (=3) The number of significant digits to + write when outputting the + EffectiveLength and NumReads columns + --vbPrior arg (=0.01) The prior that will be used in the VBEM + algorithm. This is interpreted as a + per-transcript prior, unless the + --perNucleotidePrior flag is also + given. If the --perNucleotidePrior + flag is given, this is used as a + nucleotide-level prior. 
If the default + is used, it will be divided by 1000 + before being used as a nucleotide-level + prior, i.e. the default per-nucleotide + prior will be 1e-5. + --writeOrphanLinks Write the transcripts that are linked + by orphaned reads. + --writeUnmappedNames Write the names of un-mapped reads to + the file unmapped_names.txt in the + auxiliary directory. + + +```bash +salmon quant --help-alignment +``` +Quant +========== +Perform dual-phase, alignment-based estimation of +transcript abundance from RNA-seq reads + +salmon quant options: + + +alignment input options: + --discardOrphans [alignment-based mode only] : Discard + orphan alignments in the input . If + this flag is passed, then only paired + alignments will be considered toward + quantification estimates. The default + behavior is to consider orphan + alignments if no valid paired mappings + exist. + -l [ --libType ] arg Format string describing the library + type + -a [ --alignments ] arg input alignment (BAM) file(s). + -e [ --eqclasses ] arg input salmon weighted equivalence class + file. + -t [ --targets ] arg FASTA format file containing target + transcripts. + --ont use alignment model for Oxford Nanopore + long reads + + +basic options: + -v [ --version ] print version string + -h [ --help ] produce help message + -o [ --output ] arg Output quantification directory. + --seqBias Perform sequence-specific bias + correction. + --gcBias [beta for single-end reads] Perform + fragment GC bias correction. + --posBias Perform positional bias correction. + -p [ --threads ] arg (=8) The number of threads to use + concurrently. + --incompatPrior arg (=0) This option sets the prior probability + that an alignment that disagrees with + the specified library type (--libType) + results from the true fragment origin. 
+ Setting this to 0 specifies that + alignments that disagree with the + library type should be "impossible", + while setting it to 1 says that + alignments that disagree with the + library type are no less likely than + those that do + -g [ --geneMap ] arg File containing a mapping of + transcripts to genes. If this file is + provided salmon will output both + quant.sf and quant.genes.sf files, + where the latter contains aggregated + gene-level abundance estimates. The + transcript to gene mapping should be + provided as either a GTF file, or in + a simple tab-delimited format where + each line contains the name of a + transcript and the gene to which it + belongs separated by a tab. The + extension of the file is used to + determine how the file should be + parsed. Files ending in '.gtf', '.gff' + or '.gff3' are assumed to be in GTF + format; files with any other extension + are assumed to be in the simple format. + In GTF / GFF format, the + "transcript_id" is assumed to contain + the transcript identifier and the + "gene_id" is assumed to contain the + corresponding gene identifier. + --auxTargetFile arg A file containing a list of "auxiliary" + targets. These are valid targets + (i.e., not decoys) to which fragments + are allowed to map and be assigned, and + which will be quantified, but for which + auxiliary models like sequence-specific + and fragment-GC bias correction should + not be applied. + --meta If you're using Salmon on a metagenomic + dataset, consider setting this flag to + disable parts of the abundance + estimation model that make less sense + for metagenomic data. + + +alignment-specific options: + --noErrorModel Turn off the alignment error model, + which takes into account the + observed frequency of different types + of mismatches / indels when computing + the likelihood of a given alignment. + Turning this off can speed up + alignment-based salmon, but can harm + quantification accuracy. 
+ --numErrorBins arg (=6) The number of bins into which to divide + each read when learning and applying + the error model. For example, a value + of 10 would mean that effectively, a + separate error model is learned and + applied to each 10th of the read, while + a value of 3 would mean that a separate + error model is applied to the read + beginning (first third), middle (second + third) and end (final third). + -s [ --sampleOut ] Write a "postSample.bam" file in the + output directory that will sample the + input alignments according to the + estimated transcript abundances. If + you're going to perform downstream + analysis of the alignments with tools + which don't, themselves, take fragment + assignment ambiguity into account, you + should use this output. + -u [ --sampleUnaligned ] In addition to sampling the aligned + reads, also write the un-aligned reads + to "postSample.bam". + --gencode This flag will expect the input + transcript fasta to be in GENCODE + format, and will split the transcript + name at the first '|' character. These + reduced names will be used in the + output and when looking for these + transcripts in a gene to transcript + GTF. + --scoreExp arg (=1) The factor by which sub-optimal + alignment scores are downweighted to + produce a probability. If the best + alignment score for the current read is + S, and the score for a particular + alignment is w, then the probability + will be computed proportional to exp( - + scoreExp * (S-w) ). NOTE: This flag + only has an effect if you are parsing + alignments produced by salmon itself + (i.e. pufferfish or RapMap in + selective-alignment mode). + --mappingCacheMemoryLimit arg (=2000000) + If the file contained fewer than this + many mapped reads, then just keep the + data in memory for subsequent rounds of + inference. 
Obviously, this value should + not be too large if you wish to keep a + low memory usage, but setting it large + enough to accommodate all of the mapped + read can substantially speed up + inference on "small" files that contain + only a few million reads. + + +advanced options: + --alternativeInitMode [Experimental]: Use an alternative + strategy (rather than simple + interpolation between) the online and + uniform abundance estimates to + initialize the EM / VBEM algorithm. + --auxDir arg (=aux_info) The sub-directory of the quantification + directory where auxiliary information + e.g. bootstraps, bias parameters, etc. + will be written. + --skipQuant Skip performing the actual transcript + quantification (including any Gibbs + sampling or bootstrapping). + --dumpEq Dump the simple equivalence class + counts that were computed during + mapping or alignment. + -d [ --dumpEqWeights ] Dump conditional probabilities + associated with transcripts when + equivalence class information is being + dumped to file. Note, this will dump + the factorization that is actually used + by salmon's offline phase for + inference. If you are using + range-factorized equivalence classes + (the default) then the same transcript + set may appear multiple times with + different associated conditional + probabilities. + --minAssignedFrags arg (=10) The minimum number of fragments that + must be assigned to the transcriptome + for quantification to proceed. + --reduceGCMemory If this option is selected, a more + memory efficient (but slightly slower) + representation is used to compute + fragment GC content. Enabling this will + reduce memory usage, but can also + reduce speed. However, the results + themselves will remain the same. + --biasSpeedSamp arg (=5) The value at which the fragment length + PMF is down-sampled when evaluating + sequence-specific & GC fragment bias. + Larger values speed up effective length + correction, but may decrease the + fidelity of bias modeling results. 
+ --fldMax arg (=1000) The maximum fragment length to consider + when building the empirical + distribution + --fldMean arg (=250) The mean used in the fragment length + distribution prior + --fldSD arg (=25) The standard deviation used in the + fragment length distribution prior + -f [ --forgettingFactor ] arg (=0.65000000000000002) + The forgetting factor used in the + online learning schedule. A smaller + value results in quicker learning, but + higher variance and may be unstable. A + larger value results in slower learning + but may be more stable. Value should + be in the interval (0.5, 1.0]. + --initUniform initialize the offline inference with + uniform parameters, rather than seeding + with online parameters. + --maxOccsPerHit arg (=1000) When collecting "hits" (MEMs), hits + having more than maxOccsPerHit + occurrences won't be considered. + -w [ --maxReadOcc ] arg (=200) Reads "mapping" to more than this many + places won't be considered. + --maxRecoverReadOcc arg (=2500) Relevant for alevin with '--sketch' + mode only: if a read has valid seed + matches, but no read has matches + leading to fewer than "maxReadOcc" + mappings, then try to recover mappings + for this read as long as there are + fewer than "maxRecoverReadOcc" + mappings. + --noLengthCorrection [experimental] : Entirely disables + length correction when estimating the + abundance of transcripts. This option + can be used with protocols where one + expects that fragments derive from + their underlying targets without regard + to that target's length (e.g. QuantSeq) + --noEffectiveLengthCorrection Disables effective length correction + when computing the probability that a + fragment was generated from a + transcript. If this flag is passed in, + the fragment length distribution is not + taken into account when computing this + probability. 
+ --noSingleFragProb Disables the estimation of an + associated fragment length probability + for single-end reads or for orphaned + mappings in paired-end libraries. The + default behavior is to consider the + probability of all possible fragment + lengths associated with the retained + mapping. Enabling this flag (i.e. + turning this default behavior off) will + simply not attempt to estimate a + fragment length probability in such + cases. + --noFragLengthDist [experimental] : Don't consider + concordance with the learned fragment + length distribution when trying to + determine the probability that a + fragment has originated from a + specified location. Normally, + Fragments with unlikely lengths will be + assigned a smaller relative probability + than those with more likely lengths. + When this flag is passed in, the + observed fragment length has no effect + on that fragment's a priori + probability. + --noBiasLengthThreshold [experimental] : If this option is + enabled, then no (lower) threshold will + be set on how short bias correction can + make effective lengths. This can + increase the precision of bias + correction, but harm robustness. The + default correction applies a threshold. + --numBiasSamples arg (=2000000) Number of fragment mappings to use when + learning the sequence-specific bias + model. + --numAuxModelSamples arg (=5000000) The first <numAuxModelSamples> are used + to train the auxiliary model parameters + (e.g. fragment length distribution, + bias, etc.). After the first + <numAuxModelSamples> observations the + auxiliary model parameters will be + assumed to have converged and will be + fixed. + --numPreAuxModelSamples arg (=5000) The first <numPreAuxModelSamples> will + have their assignment likelihoods and + contributions to the transcript + abundances computed without applying + any auxiliary models. The purpose of + ignoring the auxiliary models for the + first <numPreAuxModelSamples> + observations is to avoid applying these + models before their parameters have + been learned sufficiently well. 
+ --useEM Use the traditional EM algorithm for + optimization in the batch passes. + --useVBOpt Use the Variational Bayesian EM + [default] + --rangeFactorizationBins arg (=4) Factorizes the likelihood used in + quantification by adopting a new notion + of equivalence classes based on the + conditional probabilities with which + fragments are generated from different + transcripts. This is a more + fine-grained factorization than the + normal rich equivalence classes. The + default value (4) corresponds to the + default used in Zakeri et al. 2017 + (doi: 10.1093/bioinformatics/btx262), + and larger values imply a more + fine-grained factorization. If range + factorization is enabled, a common + value to select for this parameter is + 4. A value of 0 signifies the use of + basic rich equivalence classes. + --numGibbsSamples arg (=0) Number of Gibbs sampling rounds to + perform. + --noGammaDraw This switch will disable drawing + transcript fractions from a Gamma + distribution during Gibbs sampling. In + this case the sampler does not account + for shot-noise, but only assignment + ambiguity + --numBootstraps arg (=0) Number of bootstrap samples to + generate. Note: This is mutually + exclusive with Gibbs sampling. + --bootstrapReproject This switch will learn the parameter + distribution from the bootstrapped + counts for each sample, but will + reproject those parameters onto the + original equivalence class counts. + --thinningFactor arg (=16) Number of steps to discard for every + sample kept from the Gibbs chain. The + larger this number, the less chance + that subsequent samples are + auto-correlated, but the slower + sampling becomes. + -q [ --quiet ] Be quiet while doing quantification + (don't write informative output to the + console unless something goes wrong). + --perTranscriptPrior The prior (either the default or the + argument provided via --vbPrior) will + be interpreted as a transcript-level + prior (i.e. 
each transcript will be + given a prior read count of this value) + --perNucleotidePrior The prior (either the default or the + argument provided via --vbPrior) will + be interpreted as a nucleotide-level + prior (i.e. each nucleotide will be + given a prior read count of this value) + --sigDigits arg (=3) The number of significant digits to + write when outputting the + EffectiveLength and NumReads columns + --vbPrior arg (=0.01) The prior that will be used in the VBEM + algorithm. This is interpreted as a + per-transcript prior, unless the + --perNucleotidePrior flag is also + given. If the --perNucleotidePrior + flag is given, this is used as a + nucleotide-level prior. If the default + is used, it will be divided by 1000 + before being used as a nucleotide-level + prior, i.e. the default per-nucleotide + prior will be 1e-5. + --writeOrphanLinks Write the transcripts that are linked + by orphaned reads. + --writeUnmappedNames Write the names of un-mapped reads to + the file unmapped_names.txt in the + auxiliary directory. 
\ No newline at end of file
diff --git a/src/salmon/salmon_quant/script.sh b/src/salmon/salmon_quant/script.sh
new file mode 100644
index 00000000..ace79711
--- /dev/null
+++ b/src/salmon/salmon_quant/script.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# Viash wrapper for `salmon quant`: maps the component's par_* / meta_*
+# variables onto salmon command-line options, runs the quantification and
+# moves the resulting quant.sf to the path given by par_quant_results.
+
+set -e
+
+## VIASH START
+## VIASH END
+
+# Boolean arguments whose value is the string "false" are unset, so that the
+# ${par_x:+--flag} expansions in the salmon call only emit a switch when it
+# was actually requested.
+boolean_flags=(
+  par_discard_orphans
+  par_ont
+  par_seq_bias
+  par_gc_bias
+  par_pos_bias
+  par_meta
+  par_discard_orphans_quasi
+  par_disable_chaining_heuristic
+  par_allow_dovetail
+  par_recover_orphans
+  par_mimicBT2
+  par_mimic_strictBT2
+  par_softclip
+  par_softclip_overhangs
+  par_full_length_alignment
+  par_hard_filter
+  par_write_qualities
+  par_alternative_init_mode
+  par_skip_quant
+  par_dump_eq
+  par_dump_eq_weights
+  par_reduce_GC_memory
+  par_init_uniform
+  par_no_length_correction
+  par_no_effective_length_correction
+  par_no_single_frag_prob
+  par_no_frag_length_dist
+  par_no_bias_length_threshold
+  par_useEM
+  par_useVBOpt
+  par_no_Gamma_draw
+  par_bootstrap_reproject
+  par_quiet
+  par_per_transcript_prior
+  par_per_nucleotide_prior
+  par_write_orphan_links
+  par_write_unmapped_names
+  par_no_error_model
+  par_sample_out
+  par_sample_unaligned
+  par_gencode
+)
+for flag in "${boolean_flags[@]}"; do
+  if [[ "${!flag}" == "false" ]]; then
+    unset "$flag"
+  fi
+done
+
+# Multi-value inputs are ';'-separated strings. Each one is split into an
+# array and appended as individually quoted words, so file names containing
+# spaces or glob characters reach salmon intact.
+input_args=()
+add_list_arg() {
+  local option=$1 joined=$2
+  local -a items
+  [[ -n "$joined" ]] || return 0
+  IFS=";" read -ra items <<< "$joined"
+  input_args+=("$option" "${items[@]}")
+}
+add_list_arg -r "$par_unmated_reads"
+add_list_arg -1 "$par_mates1"
+add_list_arg -2 "$par_mates2"
+add_list_arg -a "$par_alignments"
+
+# --dumpEqWeights is a switch without a value (see help.txt), so it is passed
+# on its own rather than followed by the string "true".
+salmon quant \
+  ${par_lib_type:+-l "${par_lib_type}"} \
+  ${par_index:+-i "${par_index}"} \
+  "${input_args[@]}" \
+  ${par_discard_orphans:+--discardOrphans} \
+  ${par_eqclasses:+-e "${par_eqclasses}"} \
+  ${par_targets:+-t "${par_targets}"} \
+  ${par_ont:+--ont} \
+  ${par_output:+-o "${par_output}"} \
+  ${par_seq_bias:+--seqBias} \
+  ${par_gc_bias:+--gcBias} \
+  ${par_pos_bias:+--posBias} \
+  ${meta_cpus:+-p "${meta_cpus}"} \
+  ${par_incompat_prior:+--incompatPrior "${par_incompat_prior}"} \
+  ${par_gene_map:+-g "${par_gene_map}"} \
+  ${par_aux_target_file:+--auxTargetFile "${par_aux_target_file}"} \
+  ${par_meta:+--meta} \
+  ${par_score_exp:+--scoreExp "${par_score_exp}"} \
+  ${par_discard_orphans_quasi:+--discardOrphansQuasi} \
+  ${par_consensus_slack:+--consensusSlack "${par_consensus_slack}"} \
+  ${par_pre_merge_chain_sub_thresh:+--preMergeChainSubThresh "${par_pre_merge_chain_sub_thresh}"} \
+  ${par_post_merge_chain_sub_thresh:+--postMergeChainSubThresh "${par_post_merge_chain_sub_thresh}"} \
+  ${par_orphan_chain_sub_thresh:+--orphanChainSubThresh "${par_orphan_chain_sub_thresh}"} \
+  ${par_min_score_fraction:+--minScoreFraction "${par_min_score_fraction}"} \
+  ${par_mismatch_seed_skip:+--mismatchSeedSkip "${par_mismatch_seed_skip}"} \
+  ${par_disable_chaining_heuristic:+--disableChainingHeuristic} \
+  ${par_decoy_threshold:+--decoyThreshold "${par_decoy_threshold}"} \
+  ${par_ma:+--ma "${par_ma}"} \
+  ${par_mp:+--mp "${par_mp}"} \
+  ${par_go:+--go "${par_go}"} \
+  ${par_ge:+--ge "${par_ge}"} \
+  ${par_bandwidth:+--bandwidth "${par_bandwidth}"} \
+  ${par_allow_dovetail:+--allowDovetail} \
+  ${par_recover_orphans:+--recoverOrphans} \
+  ${par_mimicBT2:+--mimicBT2} \
+  ${par_mimic_strictBT2:+--mimicStrictBT2} \
+  ${par_softclip:+--softclip} \
+  ${par_softclip_overhangs:+--softclipOverhangs} \
+  ${par_full_length_alignment:+--fullLengthAlignment} \
+  ${par_hard_filter:+--hardFilter} \
+  ${par_min_aln_prob:+--minAlnProb "${par_min_aln_prob}"} \
+  ${par_write_mappings:+-z "${par_write_mappings}"} \
+  ${par_write_qualities:+--writeQualities} \
+  ${par_hit_filter_policy:+--hitFilterPolicy "${par_hit_filter_policy}"} \
+  ${par_alternative_init_mode:+--alternativeInitMode} \
+  ${par_aux_dir:+--auxDir "${par_aux_dir}"} \
+  ${par_skip_quant:+--skipQuant} \
+  ${par_dump_eq:+--dumpEq} \
+  ${par_dump_eq_weights:+--dumpEqWeights} \
+  ${par_min_assigned_frags:+--minAssignedFrags "${par_min_assigned_frags}"} \
+  ${par_reduce_GC_memory:+--reduceGCMemory} \
+  ${par_bias_speed_samp:+--biasSpeedSamp "${par_bias_speed_samp}"} \
+  ${par_fld_max:+--fldMax "${par_fld_max}"} \
+  ${par_fld_mean:+--fldMean "${par_fld_mean}"} \
+  ${par_fld_SD:+--fldSD "${par_fld_SD}"} \
+  ${par_forgetting_factor:+-f "${par_forgetting_factor}"} \
+  ${par_init_uniform:+--initUniform} \
+  ${par_max_occs_per_hit:+--maxOccsPerHit "${par_max_occs_per_hit}"} \
+  ${par_max_read_occ:+-w "${par_max_read_occ}"} \
+  ${par_no_length_correction:+--noLengthCorrection} \
+  ${par_no_effective_length_correction:+--noEffectiveLengthCorrection} \
+  ${par_no_single_frag_prob:+--noSingleFragProb} \
+  ${par_no_frag_length_dist:+--noFragLengthDist} \
+  ${par_no_bias_length_threshold:+--noBiasLengthThreshold} \
+  ${par_num_bias_samples:+--numBiasSamples "${par_num_bias_samples}"} \
+  ${par_num_aux_model_samples:+--numAuxModelSamples "${par_num_aux_model_samples}"} \
+  ${par_num_pre_aux_model_samples:+--numPreAuxModelSamples "${par_num_pre_aux_model_samples}"} \
+  ${par_useEM:+--useEM} \
+  ${par_useVBOpt:+--useVBOpt} \
+  ${par_range_factorization_bins:+--rangeFactorizationBins "${par_range_factorization_bins}"} \
+  ${par_num_Gibbs_samples:+--numGibbsSamples "${par_num_Gibbs_samples}"} \
+  ${par_no_Gamma_draw:+--noGammaDraw} \
+  ${par_num_bootstraps:+--numBootstraps "${par_num_bootstraps}"} \
+  ${par_bootstrap_reproject:+--bootstrapReproject} \
+  ${par_thinning_factor:+--thinningFactor "${par_thinning_factor}"} \
+  ${par_quiet:+--quiet} \
+  ${par_per_transcript_prior:+--perTranscriptPrior} \
+  ${par_per_nucleotide_prior:+--perNucleotidePrior} \
+  ${par_sig_digits:+--sigDigits "${par_sig_digits}"} \
+  ${par_vb_prior:+--vbPrior "${par_vb_prior}"} \
+  ${par_write_orphan_links:+--writeOrphanLinks} \
+  ${par_write_unmapped_names:+--writeUnmappedNames} \
+  ${par_no_error_model:+--noErrorModel} \
+  ${par_num_error_bins:+--numErrorBins "${par_num_error_bins}"} \
+  ${par_sample_out:+--sampleOut} \
+  ${par_sample_unaligned:+--sampleUnaligned} \
+  ${par_gencode:+--gencode} \
+  ${par_mapping_cache_memory_limit:+--mappingCacheMemoryLimit "${par_mapping_cache_memory_limit}"}
+
+# Move the main quantification table to the requested location. A missing
+# quant.sf is only reported, not treated as fatal (NOTE(review): presumably
+# because some runs, e.g. with --skipQuant, do not produce it; confirm).
+if [[ -f "$par_output/quant.sf" ]]; then
+  if [[ -n "$par_quant_results" ]]; then
+    mv -- "$par_output/quant.sf" "$par_quant_results"
+  fi
+else
+  echo "Quantification file not generated!" >&2
+fi
\ No newline at end of file
diff --git a/src/salmon/salmon_quant/test.sh b/src/salmon/salmon_quant/test.sh
new file mode 100644
index 00000000..54953a87
--- /dev/null
+++ b/src/salmon/salmon_quant/test.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+set -e
+
+echo "==============================================================================="
+echo "> Prepare test data"
+
+dir_in="test_data"
+mkdir -p "$dir_in"
+
+cat > "$dir_in/transcriptome.fasta" <<'EOF'
+>contig1
+AGCTCCAGATTCGCTCAGGCCCTTGATCATCAGTCGTCGTCGTCTTCGATTTGCCAGAGG
+AGTTTAGATGAAGAATGTCAAGGATGTTCCTCCCTGCCCTCCCATCTAGCCAAGAACATT
+TCCAAGAAGATAAAACTGTCACTGAGACAGGTCTGGATGCGCCCTAGGGGCAAATAGAGA
+>contig2
+AGGCCTTTACCACATTGCTGCTGGCTATAGGAAGTCCCAGGTACTAGCCTGAAACAGCTG
+ATATTTGGGGCTGTCACAGACAATATGGCCACCCCTTGGTCTTTATGCATGAAGATTATG
+TAAAGGTTTTTATTAAAAAATATATATATATATATAAATGATCTAGATTATTTTCCTCTT
+TCTGAAGTACTTTCTTAAAAAAATAAAATTAAATGTTTATAGTATTCCCGGT
+EOF
+
+cat > "$dir_in/a_1.fq" <<'EOF'
+@SEQ_ID1
+AGAATGTCAAGGATGTTCCTCC
++
+IIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+ACCCGCAAGATTAGGCTCCGTA
++
+!!!!!!!!!!!!!!!!!!!!!!
+@SEQ_ID3
+CTCAGGCCCTTGATCATCAGTC
++
+IIIIIIIIIIIIIIIIIIIIII
+EOF
+
+cat > "$dir_in/a_2.fq" <<'EOF'
+@SEQ_ID1
+GGAGGAACATCCTTGACATTCT
++
+IIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+GTGTACGGAGCCTAATCTTGCA
++
+!!!!!!!!!!!!!!!!!!!!!!
+@SEQ_ID3
+GACTGATGATCAAGGGCCTGAG
++
+IIIIIIIIIIIIIIIIIIIIII
+EOF
+
+cat > "$dir_in/b_1.fq" <<'EOF'
+@SEQ_ID1
+CTTTACCACATTGCTGCTGGCT
++
+IIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+ATTAGGCTCCGTAACCCGCAAG
++
+!!!!!!!!!!!!!!!!!!!!!!
+@SEQ_ID3
+GCCACCCCTTGGTCTTTATGCA
++
+IIIIIIIIIIIIIIIIIIIIII
+EOF
+
+cat > "$dir_in/b_2.fq" <<'EOF'
+@SEQ_ID1
+AGCCAGCAGCAATGTGGTAAAG
++
+IIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+CTTGCGGGTTACGGAGCCTAAT
++
+!!!!!!!!!!!!!!!!!!!!!!
+@SEQ_ID3
+TGCATAAAGACCAAGGGGTGGC
++
+IIIIIIIIIIIIIIIIIIIIII
+EOF
+
+echo "==============================================================================="
+echo "> Run salmon index"
+# NOTE(review): presumably a small k is needed because the test reads are only 22 bases long
+salmon index \
+  --transcripts "$dir_in/transcriptome.fasta" \
+  --index "$dir_in/index" \
+  --kmerLen 11
+
+echo "==============================================================================="
+echo "> Run salmon quant for single-end reads"
+"$meta_executable" \
+  --lib_type "A" \
+  --index "$dir_in/index" \
+  --unmated_reads "$dir_in/a_1.fq" \
+  --output "quant_se_results" \
+  --quant_results "quant_se.sf" \
+  --min_assigned_frags 1
+
+echo ">> Checking output"
+[ ! -d "quant_se_results" ] && echo "Output directory quant_se_results does not exist" && exit 1
+[ ! -f "quant_se.sf" ] && echo "Output file quant_se.sf does not exist!" && exit 1
+[ ! -s "quant_se.sf" ] && echo "Output file quant_se.sf is empty!" && exit 1
+grep -q $'Name\tLength\tEffectiveLength\tTPM\tNumReads' "quant_se.sf" || { echo "Output file quant_se.sf does not have the right format!"; exit 1; }
+[ "$(grep "contig1" "quant_se.sf" | cut -f 5)" != '2.000' ] && echo "Number of reads mapping to contig1 does not match the expected value!" && exit 1
+[ "$(grep "contig2" "quant_se.sf" | cut -f 5)" != '0.000' ] && echo "Number of reads mapping to contig2 does not match the expected value!" && exit 1
+[ "$(grep '"percent_mapped":' quant_se_results/aux_info/meta_info.json | cut -d':' -f 2 | tr -d '[:space:],')" != '66.66666666666666' ] && echo "Mapping rate does not match the expected value!" && exit 1
+
+echo "==============================================================================="
+echo "> Run salmon quant for paired-end reads"
+"$meta_executable" \
+  --lib_type "A" \
+  --index "$dir_in/index" \
+  --mates1 "$dir_in/a_1.fq" \
+  --mates2 "$dir_in/a_2.fq" \
+  --output "quant_pe_results" \
+  --quant_results "quant_pe.sf" \
+  --min_assigned_frags 1
+
+echo ">> Checking output"
+[ ! -d "quant_pe_results" ] && echo "Output directory quant_pe_results does not exist" && exit 1
+[ ! -f "quant_pe.sf" ] && echo "Output file quant_pe.sf does not exist!" && exit 1
+[ ! -s "quant_pe.sf" ] && echo "Output file quant_pe.sf is empty!" && exit 1
+grep -q $'Name\tLength\tEffectiveLength\tTPM\tNumReads' "quant_pe.sf" || { echo "Output file quant_pe.sf does not have the right format!"; exit 1; }
+[ "$(grep "contig1" "quant_pe.sf" | cut -f 5)" != '2.000' ] && echo "Number of reads mapping to contig1 does not match the expected value!" && exit 1
+[ "$(grep "contig2" "quant_pe.sf" | cut -f 5)" != '0.000' ] && echo "Number of reads mapping to contig2 does not match the expected value!" && exit 1
+[ "$(grep '"percent_mapped":' quant_pe_results/aux_info/meta_info.json | cut -d':' -f 2 | tr -d '[:space:],')" != '66.66666666666666' ] && echo "Mapping rate does not match the expected value!" && exit 1
+
+echo "==============================================================================="
+echo "> Run salmon quant for paired-end reads with technical replicates"
+"$meta_executable" \
+  --lib_type "A" \
+  --index "$dir_in/index" \
+  --mates1 "$dir_in/a_1.fq;$dir_in/b_1.fq" \
+  --mates2 "$dir_in/a_2.fq;$dir_in/b_2.fq" \
+  --output "quant_pe_rep_results" \
+  --quant_results "quant_pe_rep.sf" \
+  --min_assigned_frags 1
+
+echo ">> Checking output"
+[ ! -d "quant_pe_rep_results" ] && echo "Output directory quant_pe_rep_results does not exist" && exit 1
+[ ! -f "quant_pe_rep.sf" ] && echo "Output file quant_pe_rep.sf does not exist!" && exit 1
+[ ! -s "quant_pe_rep.sf" ] && echo "Output file quant_pe_rep.sf is empty!" && exit 1
+grep -q $'Name\tLength\tEffectiveLength\tTPM\tNumReads' "quant_pe_rep.sf" || { echo "Output file quant_pe_rep.sf does not have the right format!"; exit 1; }
+[ "$(grep "contig1" "quant_pe_rep.sf" | cut -f 5)" != '2.000' ] && echo "Number of reads mapping to contig1 does not match the expected value!" && exit 1
+[ "$(grep "contig2" "quant_pe_rep.sf" | cut -f 5)" != '2.000' ] && echo "Number of reads mapping to contig2 does not match the expected value!" && exit 1
+[ "$(grep '"percent_mapped":' quant_pe_rep_results/aux_info/meta_info.json | cut -d':' -f 2 | tr -d '[:space:],')" != '66.66666666666666' ] && echo "Mapping rate does not match the expected value!" && exit 1
+
+# Expected: contig1 gets 2 reads in every run; contig2 gets 0 reads, or 2 once
+# the b_* replicate is added; the mapping rate is always 66.6%. Substitutions
+# are quoted (percent_mapped is first stripped of whitespace and its trailing
+# comma) so an empty result fails the check instead of erroring inside `[`.
+
+echo "==============================================================================="
+echo "> Test successful"
\ No newline at end of file