From ede5850f577cbfe8ca5edf8525703535b12b4a36 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Sat, 10 Aug 2024 08:51:39 +0200 Subject: [PATCH] Add agat convert embl2gff (#99) * add config * add help * add test data and expected output * add script to get test data * add running script * add test script * update description * update changelog * cleanup * fix path to copy test data * pull the test data again * fix typo GTF => GFF * fix tests * fix output file: replace by generated output * fix test data: add --emblmygff3 * cleanup * config: add longer name to `-k` and `-d` --- CHANGELOG.md | 2 + .../agat_convert_embl2gff/config.vsh.yaml | 84 +++++++++++++++++++ src/agat/agat_convert_embl2gff/help.txt | 78 +++++++++++++++++ src/agat/agat_convert_embl2gff/script.sh | 23 +++++ src/agat/agat_convert_embl2gff/test.sh | 28 +++++++ .../test_data/agat_convert_embl2gff_1.embl | 51 +++++++++++ .../test_data/agat_convert_embl2gff_1.gff | 10 +++ .../agat_convert_embl2gff/test_data/script.sh | 10 +++ 8 files changed, 286 insertions(+) create mode 100644 src/agat/agat_convert_embl2gff/config.vsh.yaml create mode 100644 src/agat/agat_convert_embl2gff/help.txt create mode 100644 src/agat/agat_convert_embl2gff/script.sh create mode 100644 src/agat/agat_convert_embl2gff/test.sh create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff create mode 100755 src/agat/agat_convert_embl2gff/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9dd2389c..3c2f347a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). +* `agat/agat_convert_embl2gff`: convert an EMBL file into GFF format (PR #99). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). 
diff --git a/src/agat/agat_convert_embl2gff/config.vsh.yaml b/src/agat/agat_convert_embl2gff/config.vsh.yaml new file mode 100644 index 00000000..99ceec46 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/config.vsh.yaml @@ -0,0 +1,84 @@ +name: agat_convert_embl2gff +namespace: agat +description: | + The script takes an EMBL file as input and translates it into GFF format. +keywords: [gene annotations, GFF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_embl2gff.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --embl + description: Input EMBL file that will be read. + type: file + required: true + direction: input + example: input.embl + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gff] + description: Output GFF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: false + example: output.gff + - name: Arguments + arguments: + - name: --emblmygff3 + description: | + Means that the EMBL flat file comes from the EMBLmyGFF3 software. This is an EMBL format dedicated to submission and has particularities that need special handling. This parameter is needed to get a proper sequence id in the GFF3 from an EMBL file made with EMBLmyGFF3. + type: boolean_true + - name: --primary_tag + alternatives: [--pt, -t] + description: | + List of primary tags. Useful to discard or keep specific features. Separate multiple tags with ";" (they are passed to AGAT as a comma-separated list). 
+ type: string + multiple: true + required: false + example: [tag1, tag2] + - name: --discard + alternatives: [-d] + description: | + Means that primary tags provided by the option "primary_tag" will be discarded. + type: boolean_true + - name: --keep + alternatives: [-k] + description: | + Means that only primary tags provided by the option "primary_tag" will be kept. + type: boolean_true + - name: --config + alternatives: [-c] + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_embl2gff/help.txt b/src/agat/agat_convert_embl2gff/help.txt new file mode 100644 index 00000000..5fce4939 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/help.txt @@ -0,0 +1,78 @@ + ```sh +agat_convert_embl2gff.pl --help +``` + + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_converter_embl2gff.pl + +Description: + The script takes an EMBL file as 
input, and will translate it in gff + format. + +Usage: + agat_converter_embl2gff.pl --embl infile.embl [ -o outfile ] + +Options: + --embl Input EMBL file that will be read + + --emblmygff3 + Bolean - Means that the EMBL flat file comes from the EMBLmyGFF3 + software. This is an EMBL format dedicated for submission and + contains particularity to deal with. This parameter is needed to + get a proper sequence id in the GFF3 from an embl made with + EMBLmyGFF3. + + --primary_tag, --pt, -t + List of "primary tag". Useful to discard or keep specific + features. Multiple tags must be coma-separated. + + -d Bolean - Means that primary tags provided by the option + "primary_tag" will be discarded. + + -k Bolean - Means that only primary tags provided by the option + "primary_tag" will be kept. + + -o, --output, --out, --outfile or --gff + Output GFF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. 
+ Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md diff --git a/src/agat/agat_convert_embl2gff/script.sh b/src/agat/agat_convert_embl2gff/script.sh new file mode 100644 index 00000000..63ab8df0 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/script.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +## VIASH START +## VIASH END + + +# unset flags +[[ "$par_emblmygff3" == "false" ]] && unset par_emblmygff3 +[[ "$par_discard" == "false" ]] && unset par_discard +[[ "$par_keep" == "false" ]] && unset par_keep + +# replace ';' with ',' +par_primary_tag=$(echo $par_primary_tag | tr ';' ',') + +# run agat_convert_embl2gff +agat_convert_embl2gff.pl \ + --embl "$par_embl" \ + -o "$par_output" \ + ${par_emblmygff3:+--emblmygff3} \ + ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \ + ${par_discard:+-d} \ + ${par_keep:+-k} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_embl2gff/test.sh b/src/agat/agat_convert_embl2gff/test.sh new file mode 100644 index 00000000..81d24aaa --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" + +echo "> Run $meta_name with test data and --emblmygff3" +"$meta_executable" \ + --embl "$test_dir/agat_convert_embl2gff_1.embl" \ + --output "$out_dir/output.gff" \ + --emblmygff3 + +echo ">> Checking output" +[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! 
-s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "$out_dir/output.gff" "$test_dir/agat_convert_embl2gff_1.gff" +if [ $? -ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl new file mode 100644 index 00000000..aa4f50aa --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl @@ -0,0 +1,51 @@ +ID patatrac; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP. +XX +AC XXX; +XX +AC * _ERS324955|SC|contig000001 +XX +PR Project:PRJEBNNNN; +XX +DE XXX +XX +RN [1] +RP 1-2149 +RA XXX; +RT ; +RL Submitted {(DD-MMM-YYYY)} to the INSDC. +XX +FH Key Location/Qualifiers +FH +FT source 1..588788 +FT /organism={"scientific organism name"} +FT /mol_type={"in vivo molecule type of sequence"} +XX +SQ Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other; + tgcgtactcg aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc 60 + gccggcgagt acgcgtgatc cacaacccga agcgaccgtc gggagaccga gggtcgtcga 120 + gggtggatac gttcctgcct tcgtgccggg aaacggccga agggaacgtg gcgacctgcg 180 +// +ID fdssf; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP. +XX +AC XXX; +XX +AC * _ERS344554 +XX +PR Project:PRJEBNNNN; +XX +DE XXX +XX +RN [1] +RP 1-2149 +RA XXX; +RT ; +RL Submitted {(DD-MMM-YYYY)} to the INSDC. 
+XX +FH Key Location/Qualifiers +FH +FT source 1..588788 +FT /organism={"scientific organism name"} +FT /mol_type={"in vivo molecule type of sequence"} +XX +SQ Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other; + TTTTTTTTTT aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc 60 diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff new file mode 100644 index 00000000..f6893022 --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff @@ -0,0 +1,10 @@ +##gff-version 3 +ERS324955|SC|contig000001 EMBL/GenBank/SwissProt source 1 588788 . + 1 mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"} +ERS344554 EMBL/GenBank/SwissProt source 1 588788 . + 1 mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"} +##FASTA +>ERS324955|SC|contig000001 XXX +TGCGTACTCGAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGCGCCGGCGAGTACGCGTGATC +CACAACCCGAAGCGACCGTCGGGAGACCGAGGGTCGTCGAGGGTGGATACGTTCCTGCCTTCGTGCCGGGAAACGGCCGA +AGGGAACGTGGCGACCTGCG +>ERS344554 XXX +TTTTTTTTTTAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGC diff --git a/src/agat/agat_convert_embl2gff/test_data/script.sh b/src/agat/agat_convert_embl2gff/test_data/script.sh new file mode 100755 index 00000000..7ddbce5b --- /dev/null +++ b/src/agat/agat_convert_embl2gff/test_data/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/scripts_output/in/agat_convert_embl2gff_1.embl src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl +cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_embl2gff_1.gff src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff \ No newline at end of file