From 6be1efda252f533ffb2755cf1f4d4ce8e2fc6df2 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:32:50 +0200 Subject: [PATCH 1/7] add help --- src/agat/agat_convert_sp_gxf2gxf/help.txt | 73 +++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/agat/agat_convert_sp_gxf2gxf/help.txt diff --git a/src/agat/agat_convert_sp_gxf2gxf/help.txt b/src/agat/agat_convert_sp_gxf2gxf/help.txt new file mode 100644 index 00000000..7658c4ed --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/help.txt @@ -0,0 +1,73 @@ +```sh +agat_convert_sp_gxf2gxf.pl --help +``` + + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_convert_sp_gxf2gxf.pl + +Description: + This script fixes and/or standardizes any GTF/GFF file into full sorted + GTF/GFF file. It AGAT parser removes duplicate features, fixes + duplicated IDs, adds missing ID and/or Parent attributes, deflates + factorized attributes (attributes with several parents are duplicated + with uniq ID), add missing features when possible (e.g. add exon if only + CDS described, add UTR if CDS and exon described), fix feature locations + (e.g. check exon is embedded in the parent features mRNA, gene), etc... + + All AGAT's scripts with the _sp_ prefix use the AGAT parser, before to + perform any supplementary task. So, it is not necessary to run this + script prior the use of any other _sp_ script. + +Usage: + agat_convert_sp_gxf2gxf.pl -g infile.gff [ -o outfile ] + agat_convert_sp_gxf2gxf.pl --help + +Options: + -g, --gtf, --gff or --gxf + String - Input GTF/GFF file. Compressed file with .gz extension + is accepted. + + -o or --output + String - Output GFF file. 
If no output file is specified, the + output will be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Boolean - Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. 
+ + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md \ No newline at end of file From c71dc638965b3b99408b9bc15439ebe3b0a166da Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:40:52 +0200 Subject: [PATCH 2/7] add config --- .../agat_convert_sp_gxf2gxf/config.vsh.yaml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml diff --git a/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml b/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml new file mode 100644 index 00000000..1be87e2a --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml @@ -0,0 +1,73 @@ +name: agat_convert_sp_gxf2gxf +namespace: agat +description: | + This script fixes and/or standardizes any GTF/GFF file into a full sorted + GTF/GFF file. The AGAT parser removes duplicate features, fixes + duplicated IDs, adds missing ID and/or Parent attributes, deflates + factorized attributes (attributes with several parents are duplicated + with a unique ID), adds missing features when possible (e.g. add exon if only + CDS described, add UTR if CDS and exon described), fixes feature locations + (e.g. check exon is embedded in the parent features mRNA, gene), etc... + + All AGAT scripts with the _sp_ prefix use the AGAT parser before + performing any supplementary task. So, it is not necessary to run this + script prior to the use of any other _sp_ script. 
+keywords: [gene annotations, GFF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_sp_gxf2gxf.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --gxf + alternatives: [-g, --gtf, --gff] + description: | + String - Input GTF/GFF file. Compressed file with .gz extension is accepted. + type: file + required: true + direction: input + - name: Outputs + arguments: + - name: --output + alternatives: [-o] + description: | + String - Output GFF file. This component requires it: the wrapped script is always invoked with -o, so the annotation is never written to STDOUT. + type: file + direction: output + required: true + - name: Arguments + arguments: + - name: --config + alternatives: [-c] + description: | + String - Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file From 85abbf10c8863a8411823c76a5477d2ab9e6f07b Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:42:01 +0200 Subject: [PATCH 3/7] add run script --- src/agat/agat_convert_sp_gxf2gxf/script.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/agat/agat_convert_sp_gxf2gxf/script.sh diff --git a/src/agat/agat_convert_sp_gxf2gxf/script.sh b/src/agat/agat_convert_sp_gxf2gxf/script.sh new file mode 100644 index 00000000..2d532a41 --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +agat_convert_sp_gxf2gxf.pl \ + -g "$par_gxf" \ + -o "$par_output" \ + ${par_config:+--config "${par_config}"} From 4d9e25482ec311fadf98988c966d2395d4064530 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:52:41 +0200 Subject: [PATCH 4/7] add test data and expected output + script to fetch them --- .../test_data/0_correct_output.gff | 36 +++++++++++++++++++ .../test_data/0_test.gff | 36 +++++++++++++++++++ .../test_data/script.sh | 10 ++++++ 3 files changed, 82 insertions(+) create mode 100644 src/agat/agat_convert_sp_gxf2gxf/test_data/0_correct_output.gff create mode 100644 src/agat/agat_convert_sp_gxf2gxf/test_data/0_test.gff create mode 100755 src/agat/agat_convert_sp_gxf2gxf/test_data/script.sh diff --git a/src/agat/agat_convert_sp_gxf2gxf/test_data/0_correct_output.gff b/src/agat/agat_convert_sp_gxf2gxf/test_data/0_correct_output.gff new file mode 100644 index 00000000..fafe86ed --- 
/dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/test_data/0_correct_output.gff @@ -0,0 +1,36 @@ +##gff-version 3 +scaffold625 maker gene 337818 343277 . + . ID=CLUHARG00000005458;Name=TUBB3_2 +scaffold625 maker mRNA 337818 343277 . + . ID=CLUHART00000008717;Parent=CLUHARG00000005458 +scaffold625 maker exon 337818 337971 . + . ID=CLUHART00000008717:exon:1404;Parent=CLUHART00000008717 +scaffold625 maker exon 340733 340841 . + . ID=CLUHART00000008717:exon:1405;Parent=CLUHART00000008717 +scaffold625 maker exon 341518 341628 . + . ID=CLUHART00000008717:exon:1406;Parent=CLUHART00000008717 +scaffold625 maker exon 341964 343277 . + . ID=CLUHART00000008717:exon:1407;Parent=CLUHART00000008717 +scaffold625 maker CDS 337915 337971 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 340733 340841 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341518 341628 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341964 343033 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker five_prime_UTR 337818 337914 . + . ID=CLUHART00000008717:five_prime_utr;Parent=CLUHART00000008717 +scaffold625 maker three_prime_UTR 343034 343277 . + . ID=CLUHART00000008717:three_prime_utr;Parent=CLUHART00000008717 +scaffold789 maker gene 558184 564780 . + . ID=CLUHARG00000003852;Name=PF11_0240 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006146;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006146:exon:995;Parent=CLUHART00000006146 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006146:exon:996;Parent=CLUHART00000006146 +scaffold789 maker exon 564171 564235 . + . ID=CLUHART00000006146:exon:997;Parent=CLUHART00000006146 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006146:exon:998;Parent=CLUHART00000006146 +scaffold789 maker CDS 558191 560123 . 
+ 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564171 564235 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006146:five_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006146:three_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006147;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006147:exon:997;Parent=CLUHART00000006147 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006147:exon:998;Parent=CLUHART00000006147 +scaffold789 maker exon 562057 562121 . + . ID=CLUHART00000006147:exon:999;Parent=CLUHART00000006147 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006147:exon:1000;Parent=CLUHART00000006147 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 562057 562121 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006147:five_prime_utr;Parent=CLUHART00000006147 +scaffold789 maker three_prime_UTR 564589 564780 . + . 
ID=CLUHART00000006147:three_prime_utr;Parent=CLUHART00000006147 diff --git a/src/agat/agat_convert_sp_gxf2gxf/test_data/0_test.gff b/src/agat/agat_convert_sp_gxf2gxf/test_data/0_test.gff new file mode 100644 index 00000000..fafe86ed --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/test_data/0_test.gff @@ -0,0 +1,36 @@ +##gff-version 3 +scaffold625 maker gene 337818 343277 . + . ID=CLUHARG00000005458;Name=TUBB3_2 +scaffold625 maker mRNA 337818 343277 . + . ID=CLUHART00000008717;Parent=CLUHARG00000005458 +scaffold625 maker exon 337818 337971 . + . ID=CLUHART00000008717:exon:1404;Parent=CLUHART00000008717 +scaffold625 maker exon 340733 340841 . + . ID=CLUHART00000008717:exon:1405;Parent=CLUHART00000008717 +scaffold625 maker exon 341518 341628 . + . ID=CLUHART00000008717:exon:1406;Parent=CLUHART00000008717 +scaffold625 maker exon 341964 343277 . + . ID=CLUHART00000008717:exon:1407;Parent=CLUHART00000008717 +scaffold625 maker CDS 337915 337971 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 340733 340841 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341518 341628 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341964 343033 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker five_prime_UTR 337818 337914 . + . ID=CLUHART00000008717:five_prime_utr;Parent=CLUHART00000008717 +scaffold625 maker three_prime_UTR 343034 343277 . + . ID=CLUHART00000008717:three_prime_utr;Parent=CLUHART00000008717 +scaffold789 maker gene 558184 564780 . + . ID=CLUHARG00000003852;Name=PF11_0240 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006146;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006146:exon:995;Parent=CLUHART00000006146 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006146:exon:996;Parent=CLUHART00000006146 +scaffold789 maker exon 564171 564235 . + . 
ID=CLUHART00000006146:exon:997;Parent=CLUHART00000006146 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006146:exon:998;Parent=CLUHART00000006146 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564171 564235 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006146:five_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006146:three_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006147;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006147:exon:997;Parent=CLUHART00000006147 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006147:exon:998;Parent=CLUHART00000006147 +scaffold789 maker exon 562057 562121 . + . ID=CLUHART00000006147:exon:999;Parent=CLUHART00000006147 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006147:exon:1000;Parent=CLUHART00000006147 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 562057 562121 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006147:five_prime_utr;Parent=CLUHART00000006147 +scaffold789 maker three_prime_UTR 564589 564780 . + . 
ID=CLUHART00000006147:three_prime_utr;Parent=CLUHART00000006147 diff --git a/src/agat/agat_convert_sp_gxf2gxf/test_data/script.sh b/src/agat/agat_convert_sp_gxf2gxf/test_data/script.sh new file mode 100755 index 00000000..831dd963 --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/test_data/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/gff_syntax/in/0_test.gff src/agat/agat_convert_sp_gxf2gxf/test_data +cp -r /tmp/agat_source/t/gff_syntax/out/0_correct_output.gff src/agat/agat_convert_sp_gxf2gxf/test_data From 1be6300b1d62d878f23017f850c786ae97157583 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:52:48 +0200 Subject: [PATCH 5/7] add tests --- src/agat/agat_convert_sp_gxf2gxf/test.sh | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/agat/agat_convert_sp_gxf2gxf/test.sh diff --git a/src/agat/agat_convert_sp_gxf2gxf/test.sh b/src/agat/agat_convert_sp_gxf2gxf/test.sh new file mode 100644 index 00000000..99574b5b --- /dev/null +++ b/src/agat/agat_convert_sp_gxf2gxf/test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --gxf "$test_dir/0_test.gff" \ + --output "$out_dir/output.gff" + +echo ">> Checking output" +[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1 + + +echo ">> Check if output matches expected output" +diff "$out_dir/output.gff" "$test_dir/0_correct_output.gff" +if [ $? 
-ne 0 ]; then + echo "Output file output.gff does not match expected output" + exit 1 +fi + +echo "> Test successful" \ No newline at end of file From 14c30f251f7e54dfa0fcd5240a2ff5c4aec5c728 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:55:45 +0200 Subject: [PATCH 6/7] add example to config --- src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml b/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml index 1be87e2a..9e77b09d 100644 --- a/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml +++ b/src/agat/agat_convert_sp_gxf2gxf/config.vsh.yaml @@ -35,6 +35,7 @@ argument_groups: type: file required: true direction: input + example: input.gff - name: Outputs arguments: - name: --output @@ -44,6 +45,7 @@ argument_groups: type: file direction: output required: true + example: output.gff - name: Arguments arguments: - name: --config From dd64681b146083e7f68f3c3124e1dc55a2547ce6 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 31 Jul 2024 10:58:25 +0200 Subject: [PATCH 7/7] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4575cb9..917a6eac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ * `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). +* `agat/agat_convert_sp_gxf2gxf`: fix and/or standardize any GTF/GFF file into a full sorted GTF/GFF file (PR #103). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72).