From 3e0926ce5ee1d2c58468158f18adda53127a5fdb Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Wed, 31 Jan 2024 12:40:42 +0100 Subject: [PATCH] Add subread featurecounts (#11) * Add test data * featureCounts help * get test data * config and sript * test script * add to changelog * update description * move out of subreads directory and fix typos * add PR number * Use full argument names in descriptions * update * Update output arguments Co-authored-by: Robrecht Cannoodt * remove tmpdir argument Co-authored-by: Robrecht Cannoodt * use $meta_tmp_dir Co-authored-by: Robrecht Cannoodt * add quotes Co-authored-by: Robrecht Cannoodt * rename r_path to results_path * modified arguments * fix incorrect argument in description * fix --output_counts argument * use multiple_sep Co-authored-by: Robrecht Cannoodt * Update src/featurecounts/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * use example instead of default * minor edits * rename files to verify that parameters work as intended * Automatically add the `-J` flag if `--output_junctions` was specified * fix typo * rename arguments and let featureCounts first output to a tempdir --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + src/featurecounts/config.vsh.yaml | 337 +++++++++++++++++++++ src/featurecounts/help.txt | 242 +++++++++++++++ src/featurecounts/script.sh | 85 ++++++ src/featurecounts/test.sh | 35 +++ src/featurecounts/test_data/a.bam | Bin 0 -> 296 bytes src/featurecounts/test_data/annotation.gtf | 6 + src/featurecounts/test_data/genome.fasta | 4 + src/featurecounts/test_data/script.sh | 9 + 9 files changed, 720 insertions(+) create mode 100644 src/featurecounts/config.vsh.yaml create mode 100644 src/featurecounts/help.txt create mode 100644 src/featurecounts/script.sh create mode 100644 src/featurecounts/test.sh create mode 100644 src/featurecounts/test_data/a.bam create mode 100644 src/featurecounts/test_data/annotation.gtf create mode 
100644 src/featurecounts/test_data/genome.fasta create mode 100644 src/featurecounts/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index d7262a74..e5a98c1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ * `bgzip`: Add bgzip functionality to compress and decompress files (PR #13). +* `featurecounts`: Assign sequence reads to genomic features (PR #11). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml new file mode 100644 index 00000000..8a2002d4 --- /dev/null +++ b/src/featurecounts/config.vsh.yaml @@ -0,0 +1,337 @@ +functionality: + name: featurecounts + description: | + featureCounts is a read summarization program for counting reads generated from either RNA or genomic DNA sequencing experiments by implementing highly efficient chromosome hashing and feature blocking techniques. It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. + info: + keywords: ["Read counting", "Genomic features"] + homepage: https://subread.sourceforge.net/ + documentation: https://subread.sourceforge.net/SubreadUsersGuide.pdf + repository: https://github.com/ShiLab-Bioinformatics/subread + reference: "doi:10.1093/bioinformatics/btt656" + licence: GPL-3.0 + requirements: + commands: [ featureCounts ] + + argument_groups: + - name: Inputs + arguments: + - name: --annotation + alternatives: ["-a"] + type: file + description: | + Name of an annotation file. GTF/GFF format by default. See '--format' option for more format information. + required: true + example: annotation.gtf + - name: --input + alternatives: ["-i"] + type: file + multiple: true + multiple_sep: ';' + description: | + A list of SAM or BAM format files separated by semi-colon (;). They can be either name or location sorted. Location-sorted paired-end reads are automatically sorted by read names. 
+ required: true + example: input_file1.bam + + - name: Outputs + arguments: + - name: --counts + alternatives: ["-o"] + type: file + direction: output + description: | + Name of output file including read counts in tab delimited format. + required: true + example: features.tsv + - name: --summary + type: file + direction: output + description: | + Summary statistics of counting results in tab delimited format. + required: false + example: summary.tsv + - name: --junctions + type: file + direction: output + description: | + Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). + example: junctions.txt + required: false + + - name: Annotation + arguments: + - name: --format + alternatives: ["-F"] + type: string + description: | + Specify format of the provided annotation file. Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default. + choices: [GTF, GFF, SAF] + example: "GTF" + required: false + - name: --feature_type + alternatives: ["-t"] + type: string + description: | + Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ',' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping. + example: "exon" + required: false + multiple: true + multiple_sep: "," + - name: --attribute_type + alternatives: ["-g"] + type: string + description: | + Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value. + example: "gene_id" + required: false + - name: --extra_attributes + type: string + description: | + Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. 
If more than one attribute type is provided they should be separated by comma. + required: false + multiple: true + multiple_sep: "," + - name: --chrom_alias + alternatives: ["-A"] + type: file + description: | + Provide a chromosome name alias file to match chr names in annotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file. + required: false + example: chrom_alias.csv + + - name: Level of summarization + arguments: + - name: --feature_level + alternatives: ["-f"] + type: boolean_true + description: | + Perform read counting at feature level (eg. counting reads for exons rather than genes). + + - name: Overlap between reads and features + arguments: + - name: --overlapping + alternatives: ["-O"] + type: boolean_true + description: | + Assign reads to all their overlapping meta-features (or features if '--feature_level' is specified). + - name: --min_overlap + type: integer + description: | + Minimum number of overlapping bases in a read that is required for read assignment. 1 by default. Number of overlapping bases is counted from both reads if paired end. If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. + required: false + example: 1 + - name: --frac_overlap + type: double + description: | + Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--min_overlap' option need to be satisfied for read assignment. 
+ required: false + min: 0 + max: 1 + example: 0 + - name: --frac_overlap_feature + type: double + description: | + Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default. + required: false + min: 0 + max: 1 + example: 0 + - name: --largest_overlap + type: boolean_true + description: | + Assign reads to a meta-feature/feature that has the largest number of overlapping bases. + - name: --non_overlap + type: integer + description: | + Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default. + required: false + - name: --non_overlap_feature + type: integer + description: | + Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default. + required: false + - name: --read_extension5 + type: integer + description: | + Reads are extended upstream by bases from their 5' end. + required: false + - name: --read_extension3 + type: integer + description: | + Reads are extended upstream by bases from their 3' end. + required: false + - name: --read2pos + type: integer + description: | + Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to. + required: false + choices: [3, 5] + + - name: Multi-mapping reads + arguments: + - name: --multi_mapping + alternatives: ["-M"] + type: boolean_true + description: | + Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads. + + - name: Fractional counting + arguments: + - name: --fraction + type: boolean_true + description: | + Assign fractional counts to features. This option must be used together with '--multi_mapping' or '--overlapping' or both. 
When '--multi_mapping' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '--overlapping' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '--multi_mapping' and '--overlapping' are specified, each alignment will carry a fractional count of 1/(x*y). + + - name: Read filtering + arguments: + - name: --min_map_quality + alternatives: ["-Q"] + type: integer + description: | + The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default. + required: false + example: 0 + - name: --split_only + type: boolean_true + description: | + Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data. + - name: --non_split_only + type: boolean_true + description: | + If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored. + - name: --primary + type: boolean_true + description: | + Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field. + - name: --ignore_dup + type: boolean_true + description: | + Ignore duplicate reads in read counting. Duplicate reads are identified using bit 0x400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data. + + - name: Strandedness + arguments: + - name: --strand + alternatives: ["-s"] + type: integer + description: | + Perform strand-specific read counting. A single integer value (applied to all input files) should be provided.
Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). + choices: [0, 1, 2] + example: 0 + required: false + + - name: Exon-exon junctions + arguments: + - name: --ref_fasta + alternatives: ["-G"] + type: file + description: | + Provide the name of a FASTA-format file that contains the reference sequences used in read mapping that produced the provided SAM/BAM files. + required: false + example: reference.fasta + + - name: Parameters specific to paired end reads + arguments: + - name: --paired + alternatives: ["-p"] + type: boolean_true + description: | + Specify that input data contain paired-end reads. To perform fragment counting (ie. counting read pairs), the '--count_read_pairs' parameter should also be specified in addition to this parameter. + - name: --count_read_pairs + type: boolean_true + description: | + Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. + - name: --both_aligned + alternatives: ["-B"] + type: boolean_true + description: | + Only count read pairs that have both ends aligned. + - name: --check_pe_dist + alternatives: ["-P"] + type: boolean_true + description: | + Check validity of paired-end distance when counting read pairs. Use '--min_length' and '--max_length' to set thresholds. + - name: --min_length + alternatives: ["-d"] + type: integer + description: | + Minimum fragment/template length, 50 by default. + required: false + example: 50 + - name: --max_length + alternatives: ["-D"] + type: integer + description: | + Maximum fragment/template length, 600 by default. + required: false + example: 600 + - name: --same_strand + alternatives: ["-C"] + type: boolean_true + description: | + Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands.
+ - name: --donotsort + type: boolean_true + description: | + Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input. + + - name: Read groups + arguments: + - name: --by_read_group + type: boolean_true + description: | + Assign reads by read group. "RG" tag is required to be present in the input BAM/SAM files. + + - name: Long reads + arguments: + - name: --long_reads + type: boolean_true + description: | + Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting. + + - name: Assignment results for each read + arguments: + - name: --detailed_results + type: file + direction: output + description: | + Directory to save the detailed assignment results. Use `--detailed_results_format` to determine the format of the detailed results. + example: detailed_results/ + required: false + - name: --detailed_results_format + alternatives: ["-R"] + type: string + description: | + Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. See documentation for more info about these formats. + required: false + choices: [CORE, SAM, BAM] + + - name: Miscellaneous + arguments: + - name: --max_M_op + type: integer + description: | + Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string. + required: false + example: 10 + - name: --verbose + type: boolean_true + description: | + Output verbose information for debugging, such as un-matched chromosome/contig names.
+ + resources: + - type: bash_script + path: script.sh + + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +platforms: + - type: docker + image: quay.io/biocontainers/subread:2.0.6--he4a0461_0 + setup: + - type: docker + run: | + featureCounts -v 2>&1 | sed 's/featureCounts v\([0-9.]*\)/featureCounts: \1/' > /var/software_versions.txt + - type: nextflow \ No newline at end of file diff --git a/src/featurecounts/help.txt b/src/featurecounts/help.txt new file mode 100644 index 00000000..9ad33331 --- /dev/null +++ b/src/featurecounts/help.txt @@ -0,0 +1,242 @@ +```bash +featureCounts +``` + +Version 2.0.3 + +Usage: featureCounts [options] -a -o input_file1 [input_file2] ... + +## Mandatory arguments: + + -a Name of an annotation file. GTF/GFF format by default. See + -F option for more format information. Inbuilt annotations + (SAF format) is available in 'annotation' directory of the + package. Gzipped file is also accepted. + + -o Name of output file including read counts. A separate file + including summary statistics of counting results is also + included in the output ('.summary'). Both files + are in tab delimited format. + + input_file1 [input_file2] ... A list of SAM or BAM format files. They can be + either name or location sorted. If no files provided, + input is expected. Location-sorted paired-end reads + are automatically sorted by read names. + +## Optional arguments: +# Annotation + + -F Specify format of the provided annotation file. Acceptable + formats include 'GTF' (or compatible GFF format) and + 'SAF'. 'GTF' by default. For SAF format, please refer to + Users Guide. + + -t Specify feature type(s) in a GTF annotation. If multiple + types are provided, they should be separated by ',' with + no space in between. 'exon' by default. Rows in the + annotation with a matched feature will be extracted and + used for read mapping. + + -g Specify attribute type in GTF annotation. 'gene_id' by + default. 
Meta-features used for read counting will be + extracted from annotation using the provided value. + + --extraAttributes Extract extra attribute types from the provided GTF + annotation and include them in the counting output. These + attribute types will not be used to group features. If + more than one attribute type is provided they should be + separated by comma. + + -A Provide a chromosome name alias file to match chr names in + annotation with those in the reads. This should be a two- + column comma-delimited text file. Its first column should + include chr names in the annotation and its second column + should include chr names in the reads. Chr names are case + sensitive. No column header should be included in the + file. + +# Level of summarization + + -f Perform read counting at feature level (eg. counting + reads for exons rather than genes). + +# Overlap between reads and features + + -O Assign reads to all their overlapping meta-features (or + features if -f is specified). + + --minOverlap Minimum number of overlapping bases in a read that is + required for read assignment. 1 by default. Number of + overlapping bases is counted from both reads if paired + end. If a negative value is provided, then a gap of up + to specified size will be allowed between read and the + feature that the read is assigned to. + + --fracOverlap Minimum fraction of overlapping bases in a read that is + required for read assignment. Value should be within range + [0,1]. 0 by default. Number of overlapping bases is + counted from both reads if paired end. Both this option + and '--minOverlap' option need to be satisfied for read + assignment. + + --fracOverlapFeature Minimum fraction of overlapping bases in a + feature that is required for read assignment. Value + should be within range [0,1]. 0 by default. + + --largestOverlap Assign reads to a meta-feature/feature that has the + largest number of overlapping bases. 
+ + --nonOverlap Maximum number of non-overlapping bases in a read (or a + read pair) that is allowed when being assigned to a + feature. No limit is set by default. + + --nonOverlapFeature Maximum number of non-overlapping bases in a feature + that is allowed in read assignment. No limit is set by + default. + + --readExtension5 Reads are extended upstream by bases from their + 5' end. + + --readExtension3 Reads are extended upstream by bases from their + 3' end. + + --read2pos <5:3> Reduce reads to their 5' most base or 3' most base. Read + counting is then performed based on the single base the + read is reduced to. + +# Multi-mapping reads + + -M Multi-mapping reads will also be counted. For a multi- + mapping read, all its reported alignments will be + counted. The 'NH' tag in BAM/SAM input is used to detect + multi-mapping reads. + +# Fractional counting + + --fraction Assign fractional counts to features. This option must + be used together with '-M' or '-O' or both. When '-M' is + specified, each reported alignment from a multi-mapping + read (identified via 'NH' tag) will carry a fractional + count of 1/x, instead of 1 (one), where x is the total + number of alignments reported for the same read. When '-O' + is specified, each overlapping feature will receive a + fractional count of 1/y, where y is the total number of + features overlapping with the read. When both '-M' and + '-O' are specified, each alignment will carry a fractional + count of 1/(x*y). + +# Read filtering + + -Q The minimum mapping quality score a read must satisfy in + order to be counted. For paired-end reads, at least one + end should satisfy this criteria. 0 by default. + + --splitOnly Count split alignments only (ie. alignments with CIGAR + string containing 'N'). An example of split alignments is + exon-spanning reads in RNA-seq data. + + --nonSplitOnly If specified, only non-split alignments (CIGAR strings do + not contain letter 'N') will be counted. 
All the other + alignments will be ignored. + + --primary Count primary alignments only. Primary alignments are + identified using bit 0x100 in SAM/BAM FLAG field. + + --ignoreDup Ignore duplicate reads in read counting. Duplicate reads + are identified using bit Ox400 in BAM/SAM FLAG field. The + whole read pair is ignored if one of the reads is a + duplicate read for paired end data. + +# Strandness + + -s Perform strand-specific read counting. A single integer + value (applied to all input files) or a string of comma- + separated values (applied to each corresponding input + file) should be provided. Possible values include: + 0 (unstranded), 1 (stranded) and 2 (reversely stranded). + Default value is 0 (ie. unstranded read counting carried + out for all input files). + +# Exon-exon junctions + + -J Count number of reads supporting each exon-exon junction. + Junctions were identified from those exon-spanning reads + in the input (containing 'N' in CIGAR string). Counting + results are saved to a file named '.jcounts' + + -G Provide the name of a FASTA-format file that contains the + reference sequences used in read mapping that produced the + provided SAM/BAM files. This optional argument can be used + with '-J' option to improve read counting for junctions. + +# Parameters specific to paired end reads + + -p Specify that input data contain paired-end reads. To + perform fragment counting (ie. counting read pairs), the + '--countReadPairs' parameter should also be specified in + addition to this parameter. + + --countReadPairs Count read pairs (fragments) instead of reads. This option + is only applicable for paired-end reads. + + -B Only count read pairs that have both ends aligned. + + -P Check validity of paired-end distance when counting read + pairs. Use -d and -D to set thresholds. + + -d Minimum fragment/template length, 50 by default. + + -D Maximum fragment/template length, 600 by default. 
+ + -C Do not count read pairs that have their two ends mapping + to different chromosomes or mapping to same chromosome + but on different strands. + + --donotsort Do not sort reads in BAM/SAM input. Note that reads from + the same pair are required to be located next to each + other in the input. + +# Number of CPU threads + + -T Number of the threads. 1 by default. + +# Read groups + + --byReadGroup Assign reads by read group. "RG" tag is required to be + present in the input BAM/SAM files. + + +# Long reads + + -L Count long reads such as Nanopore and PacBio reads. Long + read counting can only run in one thread and only reads + (not read-pairs) can be counted. There is no limitation on + the number of 'M' operations allowed in a CIGAR string in + long read counting. + +# Assignment results for each read + + -R Output detailed assignment results for each read or read- + pair. Results are saved to a file that is in one of the + following formats: CORE, SAM and BAM. See Users Guide for + more info about these formats. + + --Rpath Specify a directory to save the detailed assignment + results. If unspecified, the directory where counting + results are saved is used. + +# Miscellaneous + + --tmpDir Directory under which intermediate files are saved (later + removed). By default, intermediate files will be saved to + the directory specified in '-o' argument. + + --maxMOp Maximum number of 'M' operations allowed in a CIGAR + string. 10 by default. Both 'X' and '=' are treated as 'M' + and adjacent 'M' operations are merged in the CIGAR + string. + + --verbose Output verbose information for debugging, such as un- + matched chromosome/contig names. + + -v Output version of the program. 
\ No newline at end of file diff --git a/src/featurecounts/script.sh b/src/featurecounts/script.sh new file mode 100644 index 00000000..b8da615f --- /dev/null +++ b/src/featurecounts/script.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +set -e + +## VIASH START +## VIASH END + +# Stage featureCounts output in a private temp dir; it is moved to the requested paths at the end. +tmp_dir=$(mktemp -d -p "$meta_temp_dir" "${meta_functionality_name}_XXXXXX") +mkdir -p "$tmp_dir/temp" + +if [[ $par_detailed_results ]] && [[ ! -d "$par_detailed_results" ]]; then + mkdir -p "$par_detailed_results" +fi + +# Drop boolean flags set to "false" so the ${par_x:+flag} expansions below emit nothing for them. +[[ "$par_feature_level" == "false" ]] && unset par_feature_level +[[ "$par_overlapping" == "false" ]] && unset par_overlapping +[[ "$par_largest_overlap" == "false" ]] && unset par_largest_overlap +[[ "$par_multi_mapping" == "false" ]] && unset par_multi_mapping +[[ "$par_fraction" == "false" ]] && unset par_fraction +[[ "$par_split_only" == "false" ]] && unset par_split_only +[[ "$par_non_split_only" == "false" ]] && unset par_non_split_only +[[ "$par_primary" == "false" ]] && unset par_primary +[[ "$par_ignore_dup" == "false" ]] && unset par_ignore_dup +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_count_read_pairs" == "false" ]] && unset par_count_read_pairs +[[ "$par_both_aligned" == "false" ]] && unset par_both_aligned +[[ "$par_check_pe_dist" == "false" ]] && unset par_check_pe_dist +[[ "$par_same_strand" == "false" ]] && unset par_same_strand +[[ "$par_donotsort" == "false" ]] && unset par_donotsort +[[ "$par_by_read_group" == "false" ]] && unset par_by_read_group +[[ "$par_long_reads" == "false" ]] && unset par_long_reads +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# Split the ';'-separated --input value into one array element per file. +IFS=";" read -ra input <<< "$par_input" + +featureCounts \ + ${par_format:+-F "${par_format}"} \ + ${par_feature_type:+-t "${par_feature_type}"} \ + ${par_attribute_type:+-g "${par_attribute_type}"} \ + ${par_extra_attributes:+--extraAttributes "${par_extra_attributes}"} \ + ${par_chrom_alias:+-A "${par_chrom_alias}"} \ + ${par_feature_level:+-f} \ + ${par_overlapping:+-O} \ +
${par_min_overlap:+--minOverlap "${par_min_overlap}"} \ + ${par_frac_overlap:+--fracOverlap "${par_frac_overlap}"} \ + ${par_frac_overlap_feature:+--fracOverlapFeature "${par_frac_overlap_feature}"} \ + ${par_largest_overlap:+--largestOverlap} \ + ${par_non_overlap:+--nonOverlap "${par_non_overlap}"} \ + ${par_non_overlap_feature:+--nonOverlapFeature "${par_non_overlap_feature}"} \ + ${par_read_extension5:+--readExtension5 "${par_read_extension5}"} \ + ${par_read_extension3:+--readExtension3 "${par_read_extension3}"} \ + ${par_read2pos:+--read2pos "${par_read2pos}"} \ + ${par_multi_mapping:+-M} \ + ${par_fraction:+--fraction} \ + ${par_min_map_quality:+-Q "${par_min_map_quality}"} \ + ${par_split_only:+--splitOnly} \ + ${par_non_split_only:+--nonSplitOnly} \ + ${par_primary:+--primary} \ + ${par_ignore_dup:+--ignoreDup} \ + ${par_strand:+-s "${par_strand}"} \ + ${par_junctions:+-J} \ + ${par_ref_fasta:+-G "${par_ref_fasta}"} \ + ${par_paired:+-p} \ + ${par_count_read_pairs:+--countReadPairs} \ + ${par_both_aligned:+-B} \ + ${par_check_pe_dist:+-P} \ + ${par_min_length:+-d "${par_min_length}"} \ + ${par_max_length:+-D "${par_max_length}"} \ + ${par_same_strand:+-C} \ + ${par_donotsort:+--donotsort} \ + ${par_by_read_group:+--byReadGroup} \ + ${par_long_reads:+-L} \ + ${par_detailed_results:+--Rpath "${par_detailed_results}"} \ + ${par_detailed_results_format:+-R "${par_detailed_results_format}"} \ + ${par_max_M_op:+--maxMOp "${par_max_M_op}"} \ + ${par_verbose:+--verbose} \ + ${meta_cpus:+-T "${meta_cpus}"} \ + --tmpDir "$tmp_dir/temp" \ + -a "$par_annotation" \ + -o "$tmp_dir/output.txt" \ + "${input[@]}" + +# '[[ -z ]] ||' (not '[[ ! -z ]] &&') so an unset optional output leaves exit status 0. +[[ -z "$par_counts" ]] || mv "$tmp_dir/output.txt" "$par_counts" +[[ -z "$par_summary" ]] || mv "$tmp_dir/output.txt.summary" "$par_summary" +[[
-z "$par_junctions" ]] || mv "$tmp_dir/output.txt.jcounts" "$par_junctions" diff --git a/src/featurecounts/test.sh b/src/featurecounts/test.sh new file mode 100644 index 00000000..26494f98 --- /dev/null +++ b/src/featurecounts/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +dir_in="$meta_resources_dir/test_data" + +echo "> Run featureCounts" +"$meta_executable" \ + --input "$dir_in/a.bam" \ + --annotation "$dir_in/annotation.gtf" \ + --counts "features.tsv" \ + --summary "summary.tsv" \ + --junctions "junction_counts.txt" \ + --ref_fasta "$dir_in/genome.fasta" \ + --overlapping \ + --frac_overlap 0.2 \ + --paired \ + --strand 0 \ + --detailed_results detailed_results \ + --detailed_results_format SAM + +echo ">> Checking output" +[ ! -f "features.tsv" ] && echo "Output file features.tsv does not exist" && exit 1 +[ ! -f "summary.tsv" ] && echo "Output file summary.tsv does not exist" && exit 1 +[ ! -f "junction_counts.txt" ] && echo "Output file junction_counts.txt does not exist" && exit 1 +[ ! -d "detailed_results" ] && echo "Output directory detailed_results does not exist" && exit 1 +[ ! -f "detailed_results/a.bam.featureCounts.sam" ] && echo "Output file detailed_results/a.bam.featureCounts.sam does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "features.tsv" ] && echo "Output file features.tsv is empty" && exit 1 +[ ! -s "summary.tsv" ] && echo "Output file summary.tsv is empty" && exit 1 +[ ! -s "junction_counts.txt" ] && echo "Output file junction_counts.txt is empty" && exit 1 +[ !
-s "detailed_results/a.bam.featureCounts.sam" ] && echo "Output file detailed_results/a.bam.featureCounts.sam is empty" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/featurecounts/test_data/a.bam b/src/featurecounts/test_data/a.bam new file mode 100644 index 0000000000000000000000000000000000000000..57511ab3537e519b0b82cc84d30637c43db5801e GIT binary patch literal 296 zcmb2|=3rp}f&Xj_PR>jW4GhIaUsBJcB_tGlD0s;8d9%?KKQQ<5<)c~_M+`fR5AZuNn@YW$`C3w~$m(~4>kgSYe=WVcgS$m0eJ(B*{v0Gy zWb`>`t;m5QABWctHzp`DoHAnC%-}Jhqp=%kuRNLqx)@fcG%)xnq;3Bwl9ZzHWI{{V zO6SacW@csY;x^769v(f91CGMtA3hhPefdz5R>D&zcH(ns+6U%G3CA^*WtEv_b7hr- z<7JJRWtp?vycRrkaF%R&I8EKy!|CXmh9BJaT~C`l92z7}mn%f(tw`?3vSeueGBM~E N7N1 +GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG +>2 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/src/featurecounts/test_data/script.sh b/src/featurecounts/test_data/script.sh new file mode 100644 index 00000000..28472b0e --- /dev/null +++ b/src/featurecounts/test_data/script.sh @@ -0,0 +1,9 @@ +# featureCounts test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/subread/featurecounts/test + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/subread/featurecounts/test/* src/featurecounts/test_data \ No newline at end of file