viash-hub · rcannood · Jan 31, 2024 · Jan 19, 2024 · Jan 19, 2024 · Jan 26, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 * `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3).
 
+* `featureCounts`: Assign sequence reads to genomic features. 
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES

diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml
diff --git a/src/featurecounts/help.txt b/src/featurecounts/help.txt
@@ -0,0 +1,242 @@
+```bash
+featureCounts
+```
+
+Version 2.0.3
+
+Usage: featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... 
+
+## Mandatory arguments:
+
+  -a <string>         Name of an annotation file. GTF/GFF format by default. See
+                      -F option for more format information. Inbuilt annotations
+                      (SAF format) is available in 'annotation' directory of the
+                      package. Gzipped file is also accepted.
+
+  -o <string>         Name of output file including read counts. A separate file
+                      including summary statistics of counting results is also
+                      included in the output ('<string>.summary'). Both files
+                      are in tab delimited format.
+
+  input_file1 [input_file2] ...   A list of SAM or BAM format files. They can be
+                      either name or location sorted. If no files provided,
+                      <stdin> input is expected. Location-sorted paired-end reads
+                      are automatically sorted by read names.
+
+## Optional arguments:
+# Annotation
+
+  -F <string>         Specify format of the provided annotation file. Acceptable
+                      formats include 'GTF' (or compatible GFF format) and
+                      'SAF'. 'GTF' by default.  For SAF format, please refer to
+                      Users Guide.
+
+  -t <string>         Specify feature type(s) in a GTF annotation. If multiple
+                      types are provided, they should be separated by ',' with
+                      no space in between. 'exon' by default. Rows in the
+                      annotation with a matched feature will be extracted and
+                      used for read mapping. 
+
+  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by 
+                      default. Meta-features used for read counting will be 
+                      extracted from annotation using the provided value.
+
+  --extraAttributes   Extract extra attribute types from the provided GTF
+                      annotation and include them in the counting output. These
+                      attribute types will not be used to group features. If
+                      more than one attribute type is provided they should be
+                      separated by comma.
+
+  -A <string>         Provide a chromosome name alias file to match chr names in
+                      annotation with those in the reads. This should be a two-
+                      column comma-delimited text file. Its first column should
+                      include chr names in the annotation and its second column
+                      should include chr names in the reads. Chr names are case
+                      sensitive. No column header should be included in the
+                      file.
+
+# Level of summarization
+
+  -f                  Perform read counting at feature level (eg. counting 
+                      reads for exons rather than genes).
+
+# Overlap between reads and features
+
+  -O                  Assign reads to all their overlapping meta-features (or 
+                      features if -f is specified).
+
+  --minOverlap <int>  Minimum number of overlapping bases in a read that is
+                      required for read assignment. 1 by default. Number of
+                      overlapping bases is counted from both reads if paired
+                      end. If a negative value is provided, then a gap of up
+                      to specified size will be allowed between read and the
+                      feature that the read is assigned to.
+
+  --fracOverlap <float> Minimum fraction of overlapping bases in a read that is
+                      required for read assignment. Value should be within range
+                      [0,1]. 0 by default. Number of overlapping bases is
+                      counted from both reads if paired end. Both this option
+                      and '--minOverlap' option need to be satisfied for read
+                      assignment.
+
+  --fracOverlapFeature <float> Minimum fraction of overlapping bases in a
+                      feature that is required for read assignment. Value
+                      should be within range [0,1]. 0 by default.
+
+  --largestOverlap    Assign reads to a meta-feature/feature that has the 
+                      largest number of overlapping bases.
+
+  --nonOverlap <int>  Maximum number of non-overlapping bases in a read (or a
+                      read pair) that is allowed when being assigned to a
+                      feature. No limit is set by default.
+
+  --nonOverlapFeature <int> Maximum number of non-overlapping bases in a feature
+                      that is allowed in read assignment. No limit is set by
+                      default.
+
+  --readExtension5 <int> Reads are extended upstream by <int> bases from their
+                      5' end.
+
+  --readExtension3 <int> Reads are extended upstream by <int> bases from their
+                      3' end.
+
+  --read2pos <5:3>    Reduce reads to their 5' most base or 3' most base. Read
+                      counting is then performed based on the single base the 
+                      read is reduced to.
+
+# Multi-mapping reads
+
+  -M                  Multi-mapping reads will also be counted. For a multi-
+                      mapping read, all its reported alignments will be 
+                      counted. The 'NH' tag in BAM/SAM input is used to detect 
+                      multi-mapping reads.
+
+# Fractional counting
+
+  --fraction          Assign fractional counts to features. This option must
+                      be used together with '-M' or '-O' or both. When '-M' is
+                      specified, each reported alignment from a multi-mapping
+                      read (identified via 'NH' tag) will carry a fractional
+                      count of 1/x, instead of 1 (one), where x is the total
+                      number of alignments reported for the same read. When '-O'
+                      is specified, each overlapping feature will receive a
+                      fractional count of 1/y, where y is the total number of
+                      features overlapping with the read. When both '-M' and
+                      '-O' are specified, each alignment will carry a fractional
+                      count of 1/(x*y).
+
+# Read filtering
+
+  -Q <int>            The minimum mapping quality score a read must satisfy in
+                      order to be counted. For paired-end reads, at least one
+                      end should satisfy this criteria. 0 by default.
+
+  --splitOnly         Count split alignments only (ie. alignments with CIGAR
+                      string containing 'N'). An example of split alignments is
+                      exon-spanning reads in RNA-seq data.
+
+  --nonSplitOnly      If specified, only non-split alignments (CIGAR strings do
+                      not contain letter 'N') will be counted. All the other
+                      alignments will be ignored.
+
+  --primary           Count primary alignments only. Primary alignments are 
+                      identified using bit 0x100 in SAM/BAM FLAG field.
+
+  --ignoreDup         Ignore duplicate reads in read counting. Duplicate reads 
+                      are identified using bit Ox400 in BAM/SAM FLAG field. The 
+                      whole read pair is ignored if one of the reads is a 
+                      duplicate read for paired end data.
+
+# Strandness
+
+  -s <int or string>  Perform strand-specific read counting. A single integer
+                      value (applied to all input files) or a string of comma-
+                      separated values (applied to each corresponding input
+                      file) should be provided. Possible values include:
+                      0 (unstranded), 1 (stranded) and 2 (reversely stranded).
+                      Default value is 0 (ie. unstranded read counting carried
+                      out for all input files).
+
+# Exon-exon junctions
+
+  -J                  Count number of reads supporting each exon-exon junction.
+                      Junctions were identified from those exon-spanning reads
+                      in the input (containing 'N' in CIGAR string). Counting
+                      results are saved to a file named '<output_file>.jcounts'
+
+  -G <string>         Provide the name of a FASTA-format file that contains the
+                      reference sequences used in read mapping that produced the
+                      provided SAM/BAM files. This optional argument can be used
+                      with '-J' option to improve read counting for junctions.
+
+# Parameters specific to paired end reads
+
+  -p                  Specify that input data contain paired-end reads. To
+                      perform fragment counting (ie. counting read pairs), the
+                      '--countReadPairs' parameter should also be specified in
+                      addition to this parameter.
+
+  --countReadPairs    Count read pairs (fragments) instead of reads. This option
+                      is only applicable for paired-end reads.
+
+  -B                  Only count read pairs that have both ends aligned.
+
+  -P                  Check validity of paired-end distance when counting read 
+                      pairs. Use -d and -D to set thresholds.
+
+  -d <int>            Minimum fragment/template length, 50 by default.
+
+  -D <int>            Maximum fragment/template length, 600 by default.
+
+  -C                  Do not count read pairs that have their two ends mapping 
+                      to different chromosomes or mapping to same chromosome 
+                      but on different strands.
+
+  --donotsort         Do not sort reads in BAM/SAM input. Note that reads from 
+                      the same pair are required to be located next to each 
+                      other in the input.
+
+# Number of CPU threads
+
+  -T <int>            Number of the threads. 1 by default.
+
+# Read groups
+
+  --byReadGroup       Assign reads by read group. "RG" tag is required to be
+                      present in the input BAM/SAM files.
+
+
+# Long reads
+
+  -L                  Count long reads such as Nanopore and PacBio reads. Long
+                      read counting can only run in one thread and only reads
+                      (not read-pairs) can be counted. There is no limitation on
+                      the number of 'M' operations allowed in a CIGAR string in
+                      long read counting.
+
+# Assignment results for each read
+
+  -R <format>         Output detailed assignment results for each read or read-
+                      pair. Results are saved to a file that is in one of the
+                      following formats: CORE, SAM and BAM. See Users Guide for
+                      more info about these formats.
+
+  --Rpath <string>    Specify a directory to save the detailed assignment
+                      results. If unspecified, the directory where counting
+                      results are saved is used.
+
+# Miscellaneous
+
+  --tmpDir <string>   Directory under which intermediate files are saved (later
+                      removed). By default, intermediate files will be saved to
+                      the directory specified in '-o' argument.
+
+  --maxMOp <int>      Maximum number of 'M' operations allowed in a CIGAR
+                      string. 10 by default. Both 'X' and '=' are treated as 'M'
+                      and adjacent 'M' operations are merged in the CIGAR
+                      string.
+
+  --verbose           Output verbose information for debugging, such as un-
+                      matched chromosome/contig names.
+
+  -v                  Output version of the program.
diff --git a/src/featurecounts/script.sh b/src/featurecounts/script.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+if [[ $par_tmpdir ]] && [[ ! -d "$par_tmpdir" ]]; then
+    mkdir -p $tmpDir
+fi
+
+if [[ $par_r_path ]] && [[ ! -d "$par_r_path" ]]; then
+    mkdir -p $par_r_path
+fi
+
+[[ "$par_feature_level" == "false" ]] && unset par_feature_level
+[[ "$par_overlapping" == "false" ]] && unset par_overlapping
+[[ "$par_largest_overlap" == "false" ]] && unset par_largest_overlap
+[[ "$par_multi_mapping" == "false" ]] && unset par_multi_mapping
+[[ "$par_fraction" == "false" ]] && unset par_fraction
+[[ "$par_split_only" == "false" ]] && unset par_split_only
+[[ "$par_non_split_only" == "false" ]] && unset par_non_split_only
+[[ "$par_primary" == "false" ]] && unset par_primary
+[[ "$par_ignore_dup" == "false" ]] && unset par_ignore_dup
+[[ "$par_junctions" == "false" ]] && unset par_junctions
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_count_read_pairs" == "false" ]] && unset par_count_read_pairs
+[[ "$par_both_aligned" == "false" ]] && unset par_both_aligned
+[[ "$par_check_pe_dist" == "false" ]] && unset par_check_pe_dist
+[[ "$par_same_strand" == "false" ]] && unset par_same_strand
+[[ "$par_donotsort" == "false" ]] && unset par_donotsort
+[[ "$par_by_read_group" == "false" ]] && unset par_by_read_group
+[[ "$par_long_reads" == "false" ]] && unset par_long_reads
+[[ "$par_verbose" == "false" ]] && unset par_verbose
+[[ "$par_version" == "false" ]] && unset par_version
+
+IFS="," read -ra input <<< $par_input
+
+featureCounts \
+  ${par_format:+-F "${par_format}"} \
+  ${par_feature_type:+-t "${par_feature_type}"} \
+  ${par_attribute_type:+-g "${par_attribute_type}"} \
+  ${par_extra_attributes:+--extraAttributes "${extra_attributes}"} \
+  ${par_chrom_alias:+-A "${par_chrom_alias}"} \
+  ${par_feature_level:+-f} \
+  ${par_overlapping:+-O} \
+  ${par_min_overlap:+--minOverlap "${par_min_overlap}"} \
+  ${par_frac_overlap:+--fracOverlap "${par_frac_overlap}"} \
+  ${par_frac_overlap_feature:+--fracOverlapFeature "${par_frac_overlap_feature}"} \
+  ${par_largest_overlap:+--largestOverlap} \
+  ${par_non_overlap:+--nonOverlap "${par_non_overlap}"} \
+  ${par_non_overlap_feature:+--nonOverlapFeature "${par_non_overlap_feature}"} \
+  ${par_read_extension5:+--readExtension5 "${par_read_extension5}"} \
+  ${par_read_extension3:+--readExtension3 "${par_read_extension3}"} \
+  ${par_read2pos:+--read2pos "${par_read2pos}"} \
+  ${par_multi_mapping:+-M} \
+  ${par_fraction:+--fraction} \
+  ${par_min_map_quality:+-Q "${par_min_map_quality}"} \
+  ${par_split_only:+--splitOnly} \
+  ${par_non_split_only:+--nonSplitOnly} \
+  ${par_primary:+--primary} \
+  ${par_ignore_dup:+--ignoreDup} \
+  ${par_strand:+-s "${par_strand}"} \
+  ${par_junctions:+-J} \
+  ${par_ref_fasta:+-G "${par_ref_fasta}"} \
+  ${par_paired:+-p} \
+  ${par_count_read_pairs:+--countReadPairs} \
+  ${par_both_aligned:+-B} \
+  ${par_check_pe_dist:+-P} \
+  ${par_min_length:+-d "${par_min_length}"} \
+  ${par_max_length:+-D "${par_max_length}"} \
+  ${par_same_strand:+-C} \
+  ${par_donotsort:+--donotsort} \
+  ${par_by_read_group:+--byReadGroup} \
+  ${par_long_reads:+-L} \
+  ${par_r_path:+--r_path "${par_r_path}"} \
+  ${par_detailed_results_format:+-R "${par_detailed_results_format}"} \
+  ${par_tmpdir:+--tmpDir "${par_tmpdir}"} \
+  ${par_max_M_op:+--maxMOp "${par_max_M_op}"} \
+  ${par_verbose:+--verbose} \
+  ${par_version:+-v} \
+  ${meta_cpus:+-T "${meta_cpus}"} \
+  -a "$par_annotation" \
+  -o "$par_output" \
+  "${input[*]}"
+
+[[ ! -z "$par_output_summary" ]] && mv "$par_output.summary" "$par_output_summary"
+[[ ! -z "$par_output_junctions" ]] && mv "$par_output.jcounts" "$par_output_junctions"
diff --git a/src/featurecounts/test.sh b/src/featurecounts/test.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -e
+
+dir_in="$meta_resources_dir/test_data"
+
+echo "> Run featurCounts"
+"$meta_executable" \
+  --input "$dir_in/a.bam" \
+  --annotation "$dir_in/annotation.gtf" \
+  --output "features.txt" \
+  --output_summary "features.txt.summary" \
+  --output_junctions "features.txt.jcounts" \
+  --junctions \
+  --ref_fasta "$dir_in/genome.fasta" \
+  --overlapping \
+  --frac_overlap 0.2 \
+  --paired \
+  --strand 0
+
+echo ">> Checking output"
+[ ! -f "features.txt" ] && echo "Output file features.txt does not exist" && exit 1
+[ ! -f "features.txt.summary" ] && echo "Output file features.txt.summary does not exist" && exit 1
+[ ! -f "features.txt.jcounts" ] && echo "Output file features.txt.jcounts does not exist" && exit 1
+
+echo ">> Check if output is empty"
+[ ! -s "features.txt" ] && echo "Output file features.txt is empty" && exit 1
+[ ! -s "features.txt.summary" ] && echo "Output file features.txt.summary is empty" && exit 1
+[ ! -s "features.txt.jcounts" ] && echo "Output file features.txt.jcounts is empty" && exit 1
+
+echo "> Test successful"
diff --git a/src/featurecounts/test_data/a.bam b/src/featurecounts/test_data/a.bam
diff --git a/src/featurecounts/test_data/annotation.gtf b/src/featurecounts/test_data/annotation.gtf
@@ -0,0 +1,6 @@
+1	havana	gene	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; gene_name "A"; gene_source "havana"; gene_biotype "gene";
+1	havana	transcript	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; transcript_id "ENST00000000000"; transcript_version "2"; gene_name "A"; gene_source "havana"; gene_biotype "gene"; transcript_name "A-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1";
+1	havana	exon	1	80	.	+	.	gene_id "ENSG00000000000"; gene_version "5"; transcript_id "ENST00000000000"; transcript_version "2"; exon_number "1"; gene_name "A"; gene_source "havana"; gene_biotype "gene"; transcript_name "A-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00000000000"; exon_version "1"; tag "basic"; transcript_support_level "1";
+2	havana	gene	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; gene_name "B"; gene_source "havana"; gene_biotype "gene";
+2	havana	transcript	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "B"; gene_source "havana"; gene_biotype "gene"; transcript_name "B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1";
+2	havana	exon	1	80	.	+	.	gene_id "ENSG00000000001"; gene_version "5"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "1"; gene_name "B"; gene_source "havana"; gene_biotype "gene"; transcript_name "B-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00000000001"; exon_version "1"; tag "basic"; transcript_support_level "1";
diff --git a/src/featurecounts/test_data/genome.fasta b/src/featurecounts/test_data/genome.fasta
@@ -0,0 +1,4 @@
+>1
+GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+>2
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
diff --git a/src/featurecounts/test_data/script.sh b/src/featurecounts/test_data/script.sh
@@ -0,0 +1,9 @@
+# featureCounts test data
+
+# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/subread/featurecounts/test
+
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+cp -r /tmp/snakemake-wrappers/bio/subread/featurecounts/test/* src/subread/featurecounts/test_data