diff --git a/CHANGELOG.md b/CHANGELOG.md index 0370a216..b31f43d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # biobox x.x.x +## NEW FUNCTIONALITY + +* `agat`: + - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/agat/agat_convert_genscan2gff/config.vsh.yaml b/src/agat/agat_convert_genscan2gff/config.vsh.yaml new file mode 100644 index 00000000..2adce1da --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/config.vsh.yaml @@ -0,0 +1,95 @@ +name: agat_convert_genscan2gff +namespace: agat +description: | + The script takes a GENSCAN file as input, and will translate it in gff + format. The GENSCAN format is described [here](http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/genscan.html). + + **Known problem** + + You must have submited only DNA sequence, without any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if an header is submited along the + sequence. e.g If you have an header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction location are shifted + accordingly. (checked only on the [online version](http://argonaute.mit.edu/GENSCAN.html). + I don't know if there is the same problem elsewhere.) +keywords: [gene annotations, GFF conversion, GENSCAN] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_genscan2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +requirements: + - commands: [agat] +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --genscan + alternatives: [-g] + description: Input genscan bed file that will be converted. + type: file + required: true + direction: input + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gff + - name: Arguments + arguments: + - name: --source + description: | + The source informs about the tool used to produce the data and is stored in 2nd field of a gff file. Example: Stringtie, Maker, Augustus, etc. [default: data] + type: string + required: false + example: Stringtie + - name: --primary_tag + description: | + The primary_tag corresponds to the data type and is stored in 3rd field of a gff file. Example: gene, mRNA, CDS, etc. [default: gene] + type: string + required: false + example: gene + - name: --inflate_type + description: | + Feature type (3rd column in gff) created when inflate parameter activated [default: exon]. + type: string + required: false + example: exon + - name: --verbose + description: add verbosity + type: boolean_true + - name: --config + alternatives: [-c] + description: | + AGAT config file. By default AGAT takes the original agat_config.yaml shipped with AGAT. The `--config` option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/help.txt b/src/agat/agat_convert_genscan2gff/help.txt new file mode 100644 index 00000000..8a9e9f52 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/help.txt @@ -0,0 +1,94 @@ +```sh +agat_convert_genscan2gff.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + +Name: + agat_convert_genscan2gff.pl + +Description: + The script takes a genscan file as input, and will translate it in gff + format. The genscan format is described here: + http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/gens + can.html /!\ vvv Known problem vvv /!\ You must have submited only DNA + sequence, wihtout any header!! Indeed the tool expects only DNA + sequences and does not crash/warn if an header is submited along the + sequence. e.g If you have an header ">seq" s-e-q are seen as the 3 first + nucleotides of the sequence. Then all prediction location are shifted + accordingly. (checked only on the online version + http://argonaute.mit.edu/GENSCAN.html. I don't know if there is the same + pronlem elsewhere.) /!\ ^^^ Known problem ^^^^ /!\ + +Usage: + agat_convert_genscan2gff.pl --genscan infile.bed [ -o outfile ] + agat_convert_genscan2gff.pl -h + +Options: + --genscan or -g + Input genscan bed file that will be convert. + + --source + The source informs about the tool used to produce the data and + is stored in 2nd field of a gff file. Example: + Stringtie,Maker,Augustus,etc. [default: data] + + --primary_tag + The primary_tag corresponf to the data type and is stored in 3rd + field of a gff file. Example: gene,mRNA,CDS,etc. [default: gene] + + --inflate_off + By default we inflate the block fields (blockCount, blockSizes, + blockStarts) to create subfeatures of the main feature + (primary_tag). Type of subfeature created based on the + inflate_type parameter. If you don't want this inflating + behaviour you can deactivate it by using the option + --inflate_off. + + --inflate_type + Feature type (3rd column in gff) created when inflate parameter + activated [default: exon]. + + --verbose + add verbosity + + -o , --output , --out , --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_genscan2gff/script.sh b/src/agat/agat_convert_genscan2gff/script.sh new file mode 100644 index 00000000..38afb084 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/script.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +# unset flags +[[ "$par_inflate_off" == "true" ]] && unset par_inflate_off +[[ "$par_verbose" == "false" ]] && unset par_verbose + +# run agat_convert_genscan2gff +agat_convert_genscan2gff.pl \ + --genscan "$par_genscan" \ + --output "$par_output" \ + ${par_source:+--source "${par_source}"} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_inflate_off:+--inflate_off} \ + ${par_inflate_type:+--inflate_type "${par_inflate_type}"} \ + ${par_verbose:+--verbose} \ + ${par_config:+--config "${par_config}"} \ No newline at end of file diff --git a/src/agat/agat_convert_genscan2gff/test.sh b/src/agat/agat_convert_genscan2gff/test.sh new file mode 100644 index 00000000..b666dacf --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +# create temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --genscan "$test_dir/test.genscan" \ + --output "$TMPDIR/output.gff" + +echo ">> Checking output" +[ ! -f "$TMPDIR/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$TMPDIR/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$TMPDIR/output.gff" "$test_dir/agat_convert_genscan2gff_1.gff" +if [ $? -ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" diff --git a/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff new file mode 100644 index 00000000..695fb46c --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff @@ -0,0 +1,25 @@ +##gff-version 3 +unknown genscan gene 2223 4605 75.25 + . ID=gene_1 +unknown genscan mRNA 2223 4605 75.25 + . ID=mrna_1;Parent=gene_1 +unknown genscan exon 2223 3020 75.25 + . ID=exon_1;Parent=mrna_1 +unknown genscan exon 4249 4605 13.03 + . ID=exon_2;Parent=mrna_1 +unknown genscan CDS 2223 3020 75.25 + 0 ID=cds_1;Parent=mrna_1 +unknown genscan CDS 4249 4605 13.03 + 0 ID=cds_2;Parent=mrna_1 +unknown genscan gene 6829 8789 20.06 - . ID=gene_2 +unknown genscan mRNA 6829 8789 20.06 - . ID=mrna_2;Parent=gene_2 +unknown genscan exon 6829 7297 20.06 - . ID=exon_3;Parent=mrna_2 +unknown genscan exon 7730 7888 12.78 - . ID=exon_4;Parent=mrna_2 +unknown genscan exon 8029 8185 7.45 - . ID=exon_5;Parent=mrna_2 +unknown genscan exon 8278 8546 17.45 - . ID=exon_6;Parent=mrna_2 +unknown genscan exon 8647 8789 18.65 - . ID=exon_7;Parent=mrna_2 +unknown genscan CDS 6829 7297 20.06 - 1 ID=cds_3;Parent=mrna_2 +unknown genscan CDS 7730 7888 12.78 - 1 ID=cds_4;Parent=mrna_2 +unknown genscan CDS 8029 8185 7.45 - 2 ID=cds_5;Parent=mrna_2 +unknown genscan CDS 8278 8546 17.45 - 1 ID=cds_6;Parent=mrna_2 +unknown genscan CDS 8647 8789 18.65 - 0 ID=cds_7;Parent=mrna_2 +unknown genscan gene 10209 11924 16.18 + . ID=gene_3 +unknown genscan mRNA 10209 11924 16.18 + . ID=mrna_3;Parent=gene_3 +unknown genscan exon 10209 11313 16.18 + . ID=exon_8;Parent=mrna_3 +unknown genscan exon 11850 11924 3.27 + . ID=exon_9;Parent=mrna_3 +unknown genscan CDS 10209 11313 16.18 + 0 ID=cds_8;Parent=mrna_3 +unknown genscan CDS 11850 11924 3.27 + 2 ID=cds_9;Parent=mrna_3 diff --git a/src/agat/agat_convert_genscan2gff/test_data/script.sh b/src/agat/agat_convert_genscan2gff/test_data/script.sh new file mode 100755 index 00000000..c1693653 --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/test.genscan src/agat/agat_convert_genscan2gff/test_data/test.genscan +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_genscan2gff_1.gff src/agat/agat_convert_genscan2gff/test_data/agat_convert_genscan2gff_1.gff + diff --git a/src/agat/agat_convert_genscan2gff/test_data/test.genscan b/src/agat/agat_convert_genscan2gff/test_data/test.genscan new file mode 100644 index 00000000..a88037db --- /dev/null +++ b/src/agat/agat_convert_genscan2gff/test_data/test.genscan @@ -0,0 +1,127 @@ +GENSCAN 1.0 Date run: 7-Mar-120 Time: 14:46:49 + + + +Sequence /tmp/03_07_20-14:46:49.fasta : 12217 bp : 42.83% C+G : Isochore 1 ( 0 - 43 C+G%) + + + +Parameter matrix: HumanIso.smat + + + +Predicted genes/exons: + + + +Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr.. + +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + + 1.01 Init + 2223 3020 798 2 0 55 2 924 0.940 75.25 + + 1.02 Term + 4249 4605 357 0 0 26 38 307 0.976 13.03 + + 1.03 PlyA + 4711 4716 6 -0.45 + + + + 2.06 PlyA - 4852 4847 6 -0.45 + + 2.05 Term - 7297 6829 469 0 1 13 42 387 0.281 20.06 + + 2.04 Intr - 7888 7730 159 0 0 85 93 144 0.998 12.78 + + 2.03 Intr - 8185 8029 157 2 1 65 60 144 0.787 7.45 + + 2.02 Intr - 8546 8278 269 1 2 36 65 287 0.946 17.45 + + 2.01 Init - 8789 8647 143 2 2 94 96 176 0.550 18.65 + + 2.00 Prom - 9720 9681 40 -6.55 + + + + 3.00 Prom + 10160 10199 40 -11.84 + + 3.01 Init + 10209 11313 1105 2 1 66 57 269 0.512 16.18 + + 3.02 Intr + 11850 11924 75 1 0 80 86 57 0.507 3.27 + + + +Suboptimal exons with probability > 1.000 + + + +Exnum Type S .Begin ...End .Len Fr Ph B/Ac Do/T CodRg P.... Tscr.. + +----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------ + + + +NO EXONS FOUND AT GIVEN PROBABILITY CUTOFF + + + + + +Predicted peptide sequence(s): + + + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_1|384_aa + +MSSKNKVSKQDIDSIVESLMKKQKSYFEPRLAQIQQVGMENVQKLSAIHAELALLTASIS + +TVKSDVDKLKCKVENNFSAIDGHDQAFGELELKMADMEDRSRRCNIRVIGLKERLEGFNA + +IQYLTHSLPKWFPALADVPVEVMSAHRIYSDAKRGDNRTLIFNVLRYTTRQAILRAAKKD + +PLSVDDRKVRFSPDYSNFTVKRCQAFHQAKDAARNKCLDFFLLYPATLKIKEGAQYRSFT + +SPKEAEDYVNSAASNHAATPASPRQHGTILTIYRRIHSLYDGERARKIQLLEQAASVALT + +GDNWTSVRNDNYLGVTAHFIDNVWKLRCFALEVKKKKKHSRHTAEDCAEEFIDVSNRWEI + +NGKLTTLGTDSALIMLAAARLLPF + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_2|398_aa + +MASTMPSSSSTEDEENTPECLNKDHYHFHHYTMEYIQDKPTNVARVGGFTDKKSIAKVER + +CLARERQEATEDHEAIPSTSGATSLTKKLRSRSGLPIAGSGLVLPALCIICQKKEKFINR + +AGKRQRDPLSKAETLTVGQLQKAAELKDDQSILLHIKDKDCVALEVQYHKGCYNQYTRFM + +TRPEKPEKEQNEPTFDVGYKILCERIIRQRLLVNQEVLRMGQLRMAFIELVKANEGLDAS + +NYSIKNLERSRRADAGSQRIQIFDPDQRTPTQWKKFLSEGTKKEALAEFLYVAWKNADLT + +IVGKNLCLYIAHTNQCHCVTVKEGVQSVRVVEDLLLFLHAQHAAREHKAVIIKSSDTDVA + +VIAVSVQTDLPCSLYVFTGTGNRTRIIDITKVSSANKI + + + +>/tmp/03_03_20-07:33:11.fasta|GENSCAN_predicted_peptide_3|394_aa + +MQRGRAAGINGIPPEFYVAFWEQLSPFFLHMINFSIEKGGFLRDVNTALISLLMKKDKNP + +TDCSSYRPLSLLNSDVKIFAKLLPLRLEPHMPELVSSDQTGFIKSRTAADNIRRLLHIIA + +AAPGCETPMSVLSLDAMKAFDRLEWSFLWSVLEAMGFISTFIGMVKVLYSNPSARVLTGQ + +TFSSLFPVSRSSRQGCPLSPALFVLSLEPLAQAVRLSNLVLPICICDTQHKLSLFADDVI + +VFLEHPTQSLPHFLSICEEFRKLSGFKMNWSKSALMHLNDNARKSVTPVNIPLVGQLKYL + +GIEVFPSLNQIVKHNYSLAFTNVLKDMDRWISLPMSIQARISIIKMNGLPRIHFVSSMVP + +LPPPSDYWIKISAQGVRCPLAKPFTHSPYSKTKX