diff --git a/CHANGELOG.md b/CHANGELOG.md index bd6a639a..709ca4a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ * `star/star_align_reads`: Align reads to a reference genome (PR #22). +* `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #29). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml new file mode 100644 index 00000000..e4ee2fd3 --- /dev/null +++ b/src/gffread/config.vsh.yaml @@ -0,0 +1,399 @@ +name: gffread +namespace: gffread +description: Validate, filter, convert and perform various other operations on GFF files. +keywords: [gff, conversion, validation, filtering] +links: + homepage: https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: https://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + repository: https://github.com/gpertea/gffread +references: + doi: 10.12688/f1000research.23297.2 +license: MIT +requirements: + commands: [ gffread ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + description: | + A reference file in either the GFF3, GFF2 or GTF format. + required: true + example: annotation.gff + - name: --chr_mapping + alternatives: -m + type: file + direction: input + description: | + is a name mapping table for converting reference sequence names, + having this 2-column format: . + - name: --seq_info + alternatives: -s + type: file + direction: input + description: | + is a tab-delimited file providing this info for each of the mapped + sequences: (useful for --description option with + mRNA/EST/protein mappings). + - name: --genome + alternatives: -g + type: file + description: | + Full path to a multi-fasta file with the genomic sequences for all input mappings, + OR a directory with single-fasta files (one per genomic sequence, with file names + matching sequence names). + example: genome.fa + - name: Outputs + arguments: + - name: --outfile + alternatives: -o + type: file + direction: output + must_exist: false + required: true + description: | + Write the output records into . + default: output.gff + - name: --force_exons + type: boolean_true + description: | + Make sure that the lowest level GFF features are considered "exon" features. + - name: --gene2exon + type: boolean_true + description: | + For single-line genes not parenting any transcripts, add an exon feature spanning + the entire gene (treat it as a transcript). + - name: --t_adopt + type: boolean_true + description: | + Try to find a parent gene overlapping/containing a transcript that does not have + any explicit gene Parent. + - name: --decode + alternatives: -D + type: boolean_true + description: | + Decode url encoded characters within attributes. + - name: --merge_exons + alternatives: -Z + type: boolean_true + description: | + Merge very close exons into a single exon (when intron size<4). + - name: --junctions + alternatives: -j + type: boolean_true + description: | + Output the junctions and the corresponding transcripts. + - name: --spliced_exons + alternatives: -w + type: file + direction: output + must_exist: false + description: | + Write a fasta file with spliced exons for each transcript. + example: exons.fa + - name: --w_add + type: integer + description: | + For the --spliced_exons option, extract additional bases both upstream and + downstream of the transcript boundaries. + - name: --w_nocds + type: boolean_true + description: | + For --spliced_exons, disable the output of CDS info in the FASTA file. + - name: --spliced_cds + alternatives: -x + type: file + must_exist: false + example: cds.fa + description: | + Write a fasta file with spliced CDS for each GFF transcript. + - name: --tr_cds + alternatives: -y + type: file + must_exist: false + example: tr_cds.fa + description: | + Write a protein fasta file with the translation of CDS for each record. + - name: --w_coords + alternatives: -W + type: boolean_true + description: | + For --spliced_exons, --spliced_cds and -tr_cds options, write in the FASTA defline + all the exon coordinates projected onto the spliced sequence. + - name: --stop_dot + alternatives: -S + type: boolean_true + description: | + For --tr_cds option, use '*' instead of '.' as stop codon translation. + - name: --id_version + alternatives: -L + type: boolean_true + description: | + Ensembl GTF to GFF3 conversion, adds version to IDs. + - name: --trackname + alternatives: -t + type: string + description: | + Use in the 2nd column of each GFF/GTF output line. + - name: --gtf_output + alternatives: -T + type: boolean_true + description: | + Main output will be GTF instead of GFF3. + - name: --bed + type: boolean_true + description: | + Output records in BED format instead of default GFF3. + - name: --tlf + type: boolean_true + description: | + Output "transcript line format" which is like GFF but with exons and CDS related + features stored as GFF attributes in the transcript feature line, like this: + exoncount=N;exons=;CDSphase=;CDS= + is a comma-delimited list of exon_start-exon_end coordinates; + is CDS_start:CDS_end coordinates or a list like . + - name: --table + type: string + multiple: true + multiple_sep: "," + description: | + Output a simple tab delimited format instead of GFF, with columns having the values + of GFF attributes given in ; special pseudo-attributes (prefixed by @) are + recognized: + @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen + If any of --spliced_exons/--tr_cds/--spliced_cds FASTA output files are enabled, the + same fields (excluding @id) are appended to the definition line of corresponding FASTA + records. + - name: --expose_dups + type: boolean_true + alternatives: [-E, -v] + description: | + Expose (warn about) duplicate transcript IDs and other potential problems with the + given GFF/GTF records. + - name: Options + arguments: + - name: --ids + type: file + description: | + Discard records/transcripts if their IDs are not listed in . + - name: --nids + type: file + description: | + Discard records/transcripts if their IDs are listed in . + - name: --maxintron + alternatives: -i + type: integer + description: | + Discard transcripts having an intron larger than . + - name: --minlen + alternatives: -l + type: integer + description: | + Discard transcripts shorter than bases. + - name: --range + alternatives: -r + type: string + description: | + Only show transcripts overlapping coordinate range .. (on chromosome/contig + , strand if provided). + - name: --strict_range + alternatives: -R + type: boolean_true + description: | + For --range option, discard all transcripts that are not fully contained within the given + range. + - name: --jmatch + type: string + description: | + Only output transcripts matching the given junction. + - name: --no_single_exon + alternatives: -U + type: boolean_true + description: | + Discard single-exon transcripts. + - name: --coding + alternatives: -C + type: boolean_true + description: | + Coding only: discard mRNAs that have no CDS features. + - name: --nc + type: boolean_true + description: | + Non-coding only: discard mRNAs that have CDS features. + - name: --ignore_locus + type: boolean_true + description: | + Discard locus features and attributes found in the input. + - name: --description + alternatives: -A + type: boolean_true + description: | + Use the description field from and add it as the value for a 'descr' + attribute to the GFF record. + + - name: Sorting + arguments: + - name: --sort_alpha + type: boolean_true + description: | + Chromosomes (reference sequences) are sorted alphabetically. + - name: --sort_by + type: file + must_exist: true + description: | + Sort the reference sequences by the order in which their names are given in the + file. + - name: Misc options + arguments: + - name: --keep_attrs + alternatives: -F + type: boolean_true + description: | + Keep all GFF attributes (for non-exon features). + - name: --keep_exon_attrs + type: boolean_true + description: | + For -F option, do not attempt to reduce redundant exon/CDS attributes. + - name: --no_exon_attrs + alternatives: -G + type: boolean_true + description: | + Do not keep exon attributes, move them to the transcript feature (for GFF3 output). + - name: --attrs + type: string + description: | + Only output the GTF/GFF attributes listed in which is a comma delimited + list of attribute names to. + - name: --keep_genes + type: boolean_true + description: | + In transcript-only mode (default), also preserve gene records. + - name: --keep_comments + type: boolean_true + description: | + For GFF3 input/output, try to preserve comments. + - name: --process_other + alternatives: -O + type: boolean_true + description: | + process other non-transcript GFF records (by default non-transcript records are ignored). + - name: --rm_stop_codons + alternatives: -V + type: boolean_true + description: | + Discard any mRNAs with CDS having in-frame stop codons (requires --genome). + - name: --adj_cds_start + alternatives: -H + type: boolean_true + description: | + For --rm_stop_codons option, check and adjust the starting CDS phase if the original phase + leads to a translation with an in-frame stop codon. + - name: --opposite_strand + alternatives: -B + type: boolean_true + description: | + For -V option, single-exon transcripts are also checked on the opposite strand (requires + --genome). + - name: --coding_status + alternatives: -P + type: boolean_true + description: | + Add transcript level GFF attributes about the coding status of each transcript, including + partialness or in-frame stop codons (requires --genome). + - name: --add_hasCDS + type: boolean_true + description: | + Add a "hasCDS" attribute with value "true" for transcripts that have CDS features. + - name: --adj_stop + type: boolean_true + description: | + Stop codon adjustment: enables --coding_status and performs automatic adjustment of the CDS stop + coordinate if premature or downstream. + - name: --rm_noncanon + alternatives: -N + type: boolean_true + description: | + Discard multi-exon mRNAs that have any intron with a non-canonical splice site consensus + (i.e. not GT-AG, GC-AG or AT-AC). + - name: --complete_cds + alternatives: -J + type: boolean_true + description: | + Discard any mRNAs that either lack initial START codon or the terminal STOP codon, or + have an in-frame stop codon (i.e. only print mRNAs with a complete CDS). + - name: --no_pseudo + type: boolean_true + description: | + Filter out records matching the 'pseudo' keyword. + - name: --in_bed + type: boolean_true + description: | + Input should be parsed as BED format (automatic if the input filename ends with .bed*). + - name: --in_tlf + type: boolean_true + description: | + Input GFF-like one-line-per-transcript format without exon/CDS features (see --tlf option + below); automatic if the input filename ends with .tlf). + - name: --stream + type: boolean_true + description: | + Fast processing of input GFF/BED transcripts as they are received (no sorting, exons must + be grouped by transcript in the input data). + + - name: Clustering + arguments: + - name: --merge + alternatives: -M + type: boolean_true + description: | + Cluster the input transcripts into loci, discarding "redundant" transcripts (those with + the same exact introns and fully contained or equal boundaries). + - name: --dupinfo + alternatives: -d + type: file + description: | + For --merge option, write duplication info to file . + - name: --cluster_only + type: boolean_true + description: | + Same as --merge but without discarding any of the "duplicate" transcripts, only create + "locus" features. + - name: --rm_redundant + alternatives: -K + type: boolean_true + description: | + For --merge option: also discard as redundant the shorter, fully contained transcripts (intron + chains matching a part of the container). + - name: --no_boundary + alternatives: -Q + type: boolean_true + description: | + For --merge option, no longer require boundary containment when assessing redundancy (can be + combined with --rm_redundant); only introns have to match for multi-exon transcripts, and >=80% + overlap for single-exon transcripts. + - name: --no_overlap + alternatives: -Y + type: boolean_true + description: | + For --merge option, enforce --no_boundary but also discard overlapping single-exon transcripts, + even on the opposite strand (can be combined with --rm_redudant). + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: +- type: docker + image: quay.io/biocontainers/gffread:0.12.7--hdcf5f25_3 + setup: + - type: docker + run: | + echo "gffread: \"$(gffread --version 2>&1)\"" > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/gffread/help.txt b/src/gffread/help.txt new file mode 100644 index 00000000..f9991c71 --- /dev/null +++ b/src/gffread/help.txt @@ -0,0 +1,140 @@ +```sh +gffread --help +``` + +gffread v0.12.7. Usage: +gffread [-g | ] [-s ] + [-o ] [-t ] [-r []:- [-R]] + [--jmatch :-] [--no-pseudo] + [-CTVNJMKQAFPGUBHZWTOLE] [-w ] [-x ] [-y ] + [-j ][--ids | --nids ] [--attrs ] [-i ] + [--stream] [--bed | --gtf | --tlf] [--table ] [--sort-by ] + [] + + Filter, convert or cluster GFF/GTF/BED records, extract the sequence of + transcripts (exon or CDS) and more. + By default (i.e. without -O) only transcripts are processed, discarding any + other non-transcript features. Default output is a simplified GFF3 with only + the basic attributes. + +Options: + --ids discard records/transcripts if their IDs are not listed in + --nids discard records/transcripts if their IDs are listed in + -i discard transcripts having an intron larger than + -l discard transcripts shorter than bases + -r only show transcripts overlapping coordinate range .. + (on chromosome/contig , strand if provided) + -R for -r option, discard all transcripts that are not fully + contained within the given range + --jmatch only output transcripts matching the given junction + -U discard single-exon transcripts + -C coding only: discard mRNAs that have no CDS features + --nc non-coding only: discard mRNAs that have CDS features + --ignore-locus : discard locus features and attributes found in the input + -A use the description field from and add it + as the value for a 'descr' attribute to the GFF record + -s is a tab-delimited file providing this info + for each of the mapped sequences: + + (useful for -A option with mRNA/EST/protein mappings) +Sorting: (by default, chromosomes are kept in the order they were found) + --sort-alpha : chromosomes (reference sequences) are sorted alphabetically + --sort-by : sort the reference sequences by the order in which their + names are given in the file +Misc options: + -F keep all GFF attributes (for non-exon features) + --keep-exon-attrs : for -F option, do not attempt to reduce redundant + exon/CDS attributes + -G do not keep exon attributes, move them to the transcript feature + (for GFF3 output) + --attrs only output the GTF/GFF attributes listed in + which is a comma delimited list of attribute names to + --keep-genes : in transcript-only mode (default), also preserve gene records + --keep-comments: for GFF3 input/output, try to preserve comments + -O process other non-transcript GFF records (by default non-transcript + records are ignored) + -V discard any mRNAs with CDS having in-frame stop codons (requires -g) + -H for -V option, check and adjust the starting CDS phase + if the original phase leads to a translation with an + in-frame stop codon + -B for -V option, single-exon transcripts are also checked on the + opposite strand (requires -g) + -P add transcript level GFF attributes about the coding status of each + transcript, including partialness or in-frame stop codons (requires -g) + --add-hasCDS : add a "hasCDS" attribute with value "true" for transcripts + that have CDS features + --adj-stop stop codon adjustment: enables -P and performs automatic + adjustment of the CDS stop coordinate if premature or downstream + -N discard multi-exon mRNAs that have any intron with a non-canonical + splice site consensus (i.e. not GT-AG, GC-AG or AT-AC) + -J discard any mRNAs that either lack initial START codon + or the terminal STOP codon, or have an in-frame stop codon + (i.e. only print mRNAs with a complete CDS) + --no-pseudo: filter out records matching the 'pseudo' keyword + --in-bed: input should be parsed as BED format (automatic if the input + filename ends with .bed*) + --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS + features (see --tlf option below); automatic if the input + filename ends with .tlf) + --stream: fast processing of input GFF/BED transcripts as they are received + ((no sorting, exons must be grouped by transcript in the input data) +Clustering: + -M/--merge : cluster the input transcripts into loci, discarding + "redundant" transcripts (those with the same exact introns + and fully contained or equal boundaries) + -d : for -M option, write duplication info to file + --cluster-only: same as -M/--merge but without discarding any of the + "duplicate" transcripts, only create "locus" features + -K for -M option: also discard as redundant the shorter, fully contained + transcripts (intron chains matching a part of the container) + -Q for -M option, no longer require boundary containment when assessing + redundancy (can be combined with -K); only introns have to match for + multi-exon transcripts, and >=80% overlap for single-exon transcripts + -Y for -M option, enforce -Q but also discard overlapping single-exon + transcripts, even on the opposite strand (can be combined with -K) +Output options: + --force-exons: make sure that the lowest level GFF features are considered + "exon" features + --gene2exon: for single-line genes not parenting any transcripts, add an + exon feature spanning the entire gene (treat it as a transcript) + --t-adopt: try to find a parent gene overlapping/containing a transcript + that does not have any explicit gene Parent + -D decode url encoded characters within attributes + -Z merge very close exons into a single exon (when intron size<4) + -g full path to a multi-fasta file with the genomic sequences + for all input mappings, OR a directory with single-fasta files + (one per genomic sequence, with file names matching sequence names) + -j output the junctions and the corresponding transcripts + -w write a fasta file with spliced exons for each transcript + --w-add for the -w option, extract additional bases + both upstream and downstream of the transcript boundaries + --w-nocds for -w, disable the output of CDS info in the FASTA file + -x write a fasta file with spliced CDS for each GFF transcript + -y write a protein fasta file with the translation of CDS for each record + -W for -w, -x and -y options, write in the FASTA defline all the exon + coordinates projected onto the spliced sequence; + -S for -y option, use '*' instead of '.' as stop codon translation + -L Ensembl GTF to GFF3 conversion, adds version to IDs + -m is a name mapping table for converting reference + sequence names, having this 2-column format: + + -t use in the 2nd column of each GFF/GTF output line + -o write the output records into instead of stdout + -T main output will be GTF instead of GFF3 + --bed output records in BED format instead of default GFF3 + --tlf output "transcript line format" which is like GFF + but with exons and CDS related features stored as GFF + attributes in the transcript feature line, like this: + exoncount=N;exons=;CDSphase=;CDS= + is a comma-delimited list of exon_start-exon_end coordinates; + is CDS_start:CDS_end coordinates or a list like + --table output a simple tab delimited format instead of GFF, with columns + having the values of GFF attributes given in ; special + pseudo-attributes (prefixed by @) are recognized: + @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, + @cds, @covlen, @cdslen + If any of -w/-y/-x FASTA output files are enabled, the same fields + (excluding @id) are appended to the definition line of corresponding + FASTA records + -v,-E expose (warn about) duplicate transcript IDs and other potential + problems with the given GFF/GTF records \ No newline at end of file diff --git a/src/gffread/script.sh b/src/gffread/script.sh new file mode 100644 index 00000000..26986fd1 --- /dev/null +++ b/src/gffread/script.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# unset flags +[[ "$par_coding" == "false" ]] && unset par_coding +[[ "$par_strict_range" == "false" ]] && unset par_strict_range +[[ "$par_no_single_exon" == "false" ]] && unset par_no_single_exon +[[ "$par_no_exon_attrs" == "false" ]] && unset par_no_exon_attrs +[[ "$par_nc" == "false" ]] && unset par_nc +[[ "$par_ignore_locus" == "false" ]] && unset par_ignore_locus +[[ "$par_description" == "false" ]] && unset par_description +[[ "$par_sort_alpha" == "false" ]] && unset par_sort_alpha +[[ "$par_keep_genes" == "false" ]] && unset par_keep_genes +[[ "$par_keep_attrs" == "false" ]] && unset par_keep_attrs +[[ "$par_keep_exon_attrs" == "false" ]] && unset par_keep_exon_attrs +[[ "$par_keep_comments" == "false" ]] && unset par_keep_comments +[[ "$par_process_other" == "false" ]] && unset par_process_other +[[ "$par_rm_stop_codons" == "false" ]] && unset par_rm_stop_codons +[[ "$par_adj_cds_start" == "false" ]] && unset par_adj_cds_start +[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand +[[ "$par_coding_status" == "false" ]] && unset par_coding_status +[[ "$par_add_hasCDS" == "false" ]] && unset par_add_hasCDS +[[ "$par_adj_stop" == "false" ]] && unset par_adj_stop +[[ "$par_rm_noncanon" == "false" ]] && unset par_rm_noncanon +[[ "$par_complete_cds" == "false" ]] && unset par_complete_cds +[[ "$par_no_pseudo" == "false" ]] && unset par_no_pseudo +[[ "$par_in_bed" == "false" ]] && unset par_in_bed +[[ "$par_in_tlf" == "false" ]] && unset par_in_tlf +[[ "$par_stream" == "false" ]] && unset par_stream +[[ "$par_merge" == "false" ]] && unset par_merge +[[ "$par_rm_redundant" == "false" ]] && unset par_rm_redundant +[[ "$par_no_boundary" == "false" ]] && unset par_no_boundary +[[ "$par_no_overlap" == "false" ]] && unset par_no_overlap +[[ "$par_force_exons" == "false" ]] && unset par_force_exons +[[ "$par_gene2exon" == "false" ]] && unset par_gene2exon +[[ "$par_t_adopt" == "false" ]] && unset par_t_adopt +[[ "$par_decode" == "false" ]] && unset par_decode +[[ "$par_merge_exons" == "false" ]] && unset par_merge_exons +[[ "$par_junctions" == "false" ]] && unset par_junctions +[[ "$par_w_nocds" == "false" ]] && unset par_w_nocds +[[ "$par_tr_cds" == "false" ]] && unset par_tr_cds +[[ "$par_w_coords" == "false" ]] && unset par_w_coords +[[ "$par_stop_dot" == "false" ]] && unset par_stop_dot +[[ "$par_id_version" == "false" ]] && unset par_id_version +[[ "$par_gtf_output" == "false" ]] && unset par_gtf_output +[[ "$par_bed" == "false" ]] && unset par_bed +[[ "$par_tlf" == "false" ]] && unset par_tlf +[[ "$par_expose_dups" == "false" ]] && unset par_expose_dups +[[ "$par_cluster_only" == "false" ]] && unset par_cluster_only + + +gffread \ + "$par_input" \ + ${par_chr_mapping:+-m "$par_chr_mapping"} \ + ${par_seq_info:+-s "$par_seq_info"} \ + -o "$par_outfile" \ + ${par_force_exons:+--force-exons} \ + ${par_gene2exon:+--gene2exon} \ + ${par_t_adopt:+--t-adopt} \ + ${par_decode:+-D} \ + ${par_merge_exons:+-Z} \ + ${par_genome:+-g "$par_genome"} \ + ${par_junctions:+-j} \ + ${par_spliced_exons:+-w "$par_spliced_exons"} \ + ${par_w_add:+--w-add "$par_w_add"} \ + ${par_w_nocds:+--w-nocds} \ + ${par_spliced_cds:+-x "$par_spliced_cds"} \ + ${par_tr_cds:+-y "$par_tr_cds"} \ + ${par_w_coords:+-W} \ + ${par_stop_dot:+-S} \ + ${par_id_version:+-L} \ + ${par_trackname:+-t "$par_trackname"} \ + ${par_gtf_output:+-T} \ + ${par_bed:+--bed} \ + ${par_tlf:+--tlf} \ + ${par_table:+--table "$par_table"} \ + ${par_expose_dups:+-E} \ + ${par_ids:+--ids "$par_ids"} \ + ${par_nids:+--nids "$par_nids"} \ + ${par_maxintron:+-i "$par_maxintron"} \ + ${par_minlen:+-l "$par_minlen"} \ + ${par_range:+-r "$par_range"} \ + ${par_strict_range:+-R} \ + ${par_jmatch:+--jmatch "$par_jmatch"} \ + ${par_no_single_exon:+-U} \ + ${par_coding:+-C} \ + ${par_nc:+--nc} \ + ${par_ignore_locus:+--ignore-locus} \ + ${par_description:+-A} \ + ${par_sort_alpha:+--sort-alpha} \ + ${par_sort_by:+--sort-by "$par_sort_by"} \ + ${par_keep_attrs:+-F} \ + ${par_keep_exon_attrs:+--keep-exon-attrs} \ + ${par_no_exon_attrs:+-G} \ + ${par_attrs:+--attrs "$par_attrs"} \ + ${par_keep_genes:+--keep-genes} \ + ${par_keep_comments:+--keep-comments} \ + ${par_process_other:+-O} \ + ${par_rm_stop_codons:+-V} \ + ${par_adj_cds_start:+-H} \ + ${par_opposite_strand:+-B} \ + ${par_coding_status:+-P} \ + ${par_add_hasCDS:+--add-hasCDS} \ + ${par_adj_stop:+--adj-stop} \ + ${par_rm_noncanon:+-N} \ + ${par_complete_cds:+-J} \ + ${par_no_pseudo:+--no-pseudo} \ + ${par_in_bed:+--in-bed} \ + ${par_in_tlf:+--in-tlf} \ + ${par_stream:+--stream} \ + ${par_merge:+-M} \ + ${par_dupinfo:+-d "$par_dupinfo"} \ + ${par_cluster_only:+--cluster-only} \ + ${par_rm_redundant:+-K} \ + ${par_no_boundary:+-Q} \ + ${par_no_overlap:+-Y} + diff --git a/src/gffread/test.sh b/src/gffread/test.sh new file mode 100755 index 00000000..6d38012b --- /dev/null +++ b/src/gffread/test.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_output_dir="${meta_resources_dir}test_data/test_output" +test_dir="${meta_resources_dir}test_data" +expected_output_dir="${meta_resources_dir}test_data/output" + +mkdir -p "$test_output_dir" + + +################################################################################ + +echo "> Test 1 - Read annotation file, output GFF" + +"$meta_executable" \ + --expose_dups \ + --outfile "$test_output_dir/ann_simple.gff" \ + --input "$test_dir/sequence.gff3" + + +echo ">> Check if output exists" +[ ! -f "$test_output_dir/ann_simple.gff" ] \ + && echo "Output file test_output/ann_simple.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$test_output_dir/ann_simple.gff" ] \ + && echo "Output file test_output/ann_simple.gff is empty" && exit 1 + +echo ">> Compare output to expected output" + +# compare file expect lines starting with "#" +diff <(grep -v "^#" "$expected_output_dir/ann_simple.gff") \ + <(grep -v "^#" "$test_output_dir/ann_simple.gff") || \ + (echo "Output file ann_simple.gff does not match expected output" && exit 1) + +################################################################################ + +echo "> Test 2 - Read annotation file, output GTF" + +"$meta_executable" \ + --gtf_output \ + --outfile "$test_output_dir/annotation.gtf" \ + --input "$test_dir/sequence.gff3" + +echo ">> Check if output exists" +[ ! -f "$test_output_dir/annotation.gtf" ] \ + && echo "Output file test_output/annotation.gtf does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$test_output_dir/annotation.gtf" ] \ + && echo "Output file test_output/annotation.gtf is empty" && exit 1 + +echo ">> Compare output to expected output" +diff "$expected_output_dir/annotation.gtf" "$test_output_dir/annotation.gtf" || \ + (echo "Output file annotation.gtf does not match expected output" && exit 1) + +################################################################################ + +echo "> Test 3 - Generate fasta file from annotation file" + + +"$meta_executable" \ + --genome "$test_dir/sequence.fasta" \ + --spliced_exons "$test_output_dir/transcripts.fa" \ + --outfile "$test_output_dir/output.gff" \ + --input "$test_dir/sequence.gff3" + +echo ">> Check if output exists" +[ ! -f "$test_output_dir/transcripts.fa" ] \ + && echo "Output file transcripts.fa does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$test_output_dir/transcripts.fa" ] \ + && echo "Output file transcripts.fa is empty" && exit 1 + +echo ">> Compare output to expected output" +diff "$expected_output_dir/transcripts.fa" "$test_output_dir/transcripts.fa" || \ + (echo "Output file transcripts.fa does not match expected output" && exit 1) + +################################################################################ + +echo "> Test 4 - Generate table from GFF annotation file" + +"$meta_executable" \ + --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ + --outfile "$test_output_dir/annotation.tbl" \ + --input "$test_dir/sequence.gff3" + +echo ">> Check if output exists" +[ ! -f "$test_output_dir/annotation.tbl" ] \ + && echo "Output file test_output/annotation.tbl does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$test_output_dir/annotation.tbl" ] \ + && echo "Output file test_output/annotation.tbl is empty" && exit 1 + +echo ">> Compare output to expected output" +diff "$expected_output_dir/annotation.tbl" "$test_output_dir/annotation.tbl" || \ + (echo "Output file annotation.tbl does not match expected output" && exit 1) + +################################################################################ + +rm -r "$test_output_dir" + +echo "> All tests successful" + +exit 0 \ No newline at end of file diff --git a/src/gffread/test_data/README.md b/src/gffread/test_data/README.md new file mode 100644 index 00000000..f1638b95 --- /dev/null +++ b/src/gffread/test_data/README.md @@ -0,0 +1,38 @@ +## GffRead usage examples + +GffRead can be used to simply read an annotation file in a GFF format, and print it in either GFF3 (default) or +GTF2 format (with the -T option), while discarding any non-trasncript features and optional attributes. +It can also report some potential issues found in the input GFF records. The command line for such a quick GFF/GTF +file cleanup would be: +``` +gffread -E annotation.gff -o ann_simple.gff +``` + +This will create a minimalist GFF3 re-formatting of the transcript records found in the input file (`annotation.gff` in this example). +The -E option directs GffRead to "expose" (display warnings about) any potential formatting issues +encountered while parsing the input file. + +In order to obtain the GTF2 version of the same transcript records, the `-T` option should be added: +``` +gffread annotation.gff -T -o annotation.gtf +``` + +GffRead can be used to generate a FASTA file with the DNA sequences for all transcripts in a GFF file. For this operation +a fasta file with the genomic sequences has to be provided as well. This can be accomplished with a command line like this: +``` +gffread -w transcripts.fa -g genome.fa annotation.gff +``` +The file `genome.fa` in this example would be a multi-fasta file with the chromosome/contig sequences of the target genome. +This also requires that every contig or chromosome name found in the 1st column of the input GFF file +(`annotation.gff` in this example) must have a corresponding sequence entry in the `genome.fa` file. + + +``` +gffread --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ + -o annotation.tbl annotation.gff +``` +This shows how the `--table` option can make a tab delimited table out of a GFF3 input. + +The `output` directory contains all the output files that should be generated by the above examples. + + diff --git a/src/gffread/test_data/output/ann_simple.gff b/src/gffread/test_data/output/ann_simple.gff new file mode 100644 index 00000000..c8e5e933 --- /dev/null +++ b/src/gffread/test_data/output/ann_simple.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# gffread v0.12.7 +# gffread -E -o output/ann_simple.gff sequence.gff3 +NM_141699.3 RefSeq gene 22 795 . + . ID=gene-Dmel_CG16905;gene_name=eloF +NM_141699.3 RefSeq CDS 22 795 . + 0 Parent=gene-Dmel_CG16905 diff --git a/src/gffread/test_data/output/annotation.gtf b/src/gffread/test_data/output/annotation.gtf new file mode 100644 index 00000000..7e203137 --- /dev/null +++ b/src/gffread/test_data/output/annotation.gtf @@ -0,0 +1,2 @@ +NM_141699.3 RefSeq transcript 22 795 . + . transcript_id "gene-Dmel_CG16905"; gene_id "gene-Dmel_CG16905"; gene_name "eloF" +NM_141699.3 RefSeq CDS 22 795 . + 0 transcript_id "gene-Dmel_CG16905"; gene_name "eloF"; diff --git a/src/gffread/test_data/output/annotation.tbl b/src/gffread/test_data/output/annotation.tbl new file mode 100644 index 00000000..15a5c0fd --- /dev/null +++ b/src/gffread/test_data/output/annotation.tbl @@ -0,0 +1 @@ +gene-Dmel_CG16905 NM_141699.3 22 795 + 22-795 eloF eloF elongase F diff --git a/src/gffread/test_data/output/transcripts.fa b/src/gffread/test_data/output/transcripts.fa new file mode 100644 index 00000000..889ebec9 --- /dev/null +++ b/src/gffread/test_data/output/transcripts.fa @@ -0,0 +1,13 @@ +>gene-Dmel_CG16905 CDS=1-774 +ATGTTCGCTCCGATAGATCCTGTAAAGATACCCGTTGTAAGCAATCCATGGATAACCATGGGCACATTGA +TTGGCTATCTGCTGTTTGTGCTCAAGCTGGGCCCCAAAATCATGGAGCACCGAAAGCCCTTCCATTTGAA +TGGCGTCATCAGGATCTACAACATATTCCAGATCCTTTACAATGGTCTAATACTCGTTTTAGGAGTTCAC +TTCCTGTTTGTCCTGAAAGCCTACCAAATCAGTTGCATTGTTAGCCTGCCGATGGATCACAAATATAAGG +ATAGAGAGCGTTTGATTTGCACTTTGTACCTGGTGAACAAATTCGTAGACCTTGTGGAAACCATTTTCTT +TGTGCTCCGCAAAAAGGACAGACAGATATCCTTCCTGCACGTCTTCCATCATTTTGCGATGGCATTTTTT +GGATATCTCTACTACTGCTTCCACGGATACGGTGGCGTTGCCTTTCCACAGTGCCTGCTAAACACCGCCG +TCCACGTGATTATGTACGCCTACTACTATCTATCCTCGATCAGCAAGGAGGTGCAGAGAAGTCTCTGGTG +GAAGAAATACATCACAATTGCTCAGCTGGTCCAGTTCGCCATTATTCTGCTCCACTGTACCATCACGCTG +GCACAGCCCAACTGCGCGGTCAACAGACCCTTGACCTACGGATGCGGATCGCTTTCAGCGTTTTTTGCAG +TGATATTTAGCCAATTTTATTACCACAACTACATAAAGCCAGGAAAGAAGTCAGCGAAACAAAACAAAAA +TTAA diff --git a/src/gffread/test_data/script.sh b/src/gffread/test_data/script.sh new file mode 100755 index 00000000..0c6e725c --- /dev/null +++ b/src/gffread/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/gffread_source ]; then + git clone --depth 2 --single-branch --branch master https://github.com/gpertea/gffread.git /tmp/gffread_source +fi + +# copy test data +cp -r /tmp/gffread_source/examples/* src/gffread/test_data diff --git a/src/gffread/test_data/sequence.fasta b/src/gffread/test_data/sequence.fasta new file mode 100644 index 00000000..31ec0f04 --- /dev/null +++ b/src/gffread/test_data/sequence.fasta @@ -0,0 +1,16 @@ +>NM_141699.3 Drosophila melanogaster elongase F (eloF), mRNA +CACAACTCGATTAGATTCGCCATGTTCGCTCCGATAGATCCTGTAAAGATACCCGTTGTAAGCAATCCAT +GGATAACCATGGGCACATTGATTGGCTATCTGCTGTTTGTGCTCAAGCTGGGCCCCAAAATCATGGAGCA +CCGAAAGCCCTTCCATTTGAATGGCGTCATCAGGATCTACAACATATTCCAGATCCTTTACAATGGTCTA +ATACTCGTTTTAGGAGTTCACTTCCTGTTTGTCCTGAAAGCCTACCAAATCAGTTGCATTGTTAGCCTGC +CGATGGATCACAAATATAAGGATAGAGAGCGTTTGATTTGCACTTTGTACCTGGTGAACAAATTCGTAGA +CCTTGTGGAAACCATTTTCTTTGTGCTCCGCAAAAAGGACAGACAGATATCCTTCCTGCACGTCTTCCAT +CATTTTGCGATGGCATTTTTTGGATATCTCTACTACTGCTTCCACGGATACGGTGGCGTTGCCTTTCCAC +AGTGCCTGCTAAACACCGCCGTCCACGTGATTATGTACGCCTACTACTATCTATCCTCGATCAGCAAGGA +GGTGCAGAGAAGTCTCTGGTGGAAGAAATACATCACAATTGCTCAGCTGGTCCAGTTCGCCATTATTCTG +CTCCACTGTACCATCACGCTGGCACAGCCCAACTGCGCGGTCAACAGACCCTTGACCTACGGATGCGGAT +CGCTTTCAGCGTTTTTTGCAGTGATATTTAGCCAATTTTATTACCACAACTACATAAAGCCAGGAAAGAA +GTCAGCGAAACAAAACAAAAATTAACTAAATTTAAACTAAATCATGAGTACAAAGCCTAAAGATTCGTGA +AGCAACAATAGCCACAGCCTATTTTTGAATATTTCATATATGATTTTATGGGGTAAATGAATTAAAAAAC +ATTTGTTTTCTTGGCGTCAAACT + diff --git a/src/gffread/test_data/sequence.gff3 b/src/gffread/test_data/sequence.gff3 new file mode 100644 index 00000000..c6a77a7a --- /dev/null +++ b/src/gffread/test_data/sequence.gff3 @@ -0,0 +1,9 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +##sequence-region NM_141699.3 1 933 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7227 +NM_141699.3 RefSeq region 1 933 . + . ID=NM_141699.3:1..933;Dbxref=taxon:7227;Name=3R;chromosome=3R;gbkey=Src;genome=chromosome;genotype=y[1]%3B Gr22b[1] Gr22d[1] cn[1] CG33964[R4.2] bw[1] sp[1]%3B LysC[1] MstProx[1] GstD5[1] Rh6[1];mol_type=mRNA +NM_141699.3 RefSeq gene 1 933 . + . ID=gene-Dmel_CG16905;Dbxref=FLYBASE:FBgn0037762,GeneID:41211;Name=eloF;cyt_map=85E10-85E10;description=elongase F;gbkey=Gene;gen_map=3-49 cM;gene=eloF;gene_synonym=CG16905,Dmel\CG16905,EloF;locus_tag=Dmel_CG16905 +NM_141699.3 RefSeq CDS 22 795 . + 0 ID=cds-NP_649956.1;Parent=gene-Dmel_CG16905;Dbxref=FLYBASE:FBpp0081622,GeneID:41211,GenBank:NP_649956.1,FLYBASE:FBgn0037762;Name=NP_649956.1;gbkey=CDS;gene=eloF;locus_tag=Dmel_CG16905;orig_transcript_id=gnl|FlyBase|CG16905-RA;product=elongase F;protein_id=NP_649956.1 +