From d6c9475ccf825f2df5666cdd0baf4048e98b8812 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Sat, 26 Oct 2024 15:07:08 +0200 Subject: [PATCH] Add agat sp statistics (#107) * add help * add config * add running script * add test data and expected output + script to fetch them * add tests * update changelog * cleanup * config: replace `-d` by a longer name `--plot` * add set -eo pipefail to script and test files * create temporary directory and clean up on exit * improve config: add requirements, add keywords, format description,.. * cleanup changelog * PR fixes, extended unit tests * Smaller test data, small changes to version format and config format --------- Co-authored-by: Robrecht Cannoodt Co-authored-by: jakubmajercik Co-authored-by: Emma Rousseau --- CHANGELOG.md | 7 ++ src/agat/agat_sp_statistics/config.vsh.yaml | 93 +++++++++++++++++++ src/agat/agat_sp_statistics/help.txt | 60 ++++++++++++ src/agat/agat_sp_statistics/script.sh | 26 ++++++ src/agat/agat_sp_statistics/test.sh | 65 +++++++++++++ src/agat/agat_sp_statistics/test_data/1.gff | 78 ++++++++++++++++ .../agat_sp_statistics/test_data/script.sh | 14 +++ .../test_data/stats_out.txt | 93 +++++++++++++++++++ 8 files changed, 436 insertions(+) create mode 100644 src/agat/agat_sp_statistics/config.vsh.yaml create mode 100644 src/agat/agat_sp_statistics/help.txt create mode 100644 src/agat/agat_sp_statistics/script.sh create mode 100644 src/agat/agat_sp_statistics/test.sh create mode 100644 src/agat/agat_sp_statistics/test_data/1.gff create mode 100755 src/agat/agat_sp_statistics/test_data/script.sh create mode 100644 src/agat/agat_sp_statistics/test_data/stats_out.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e59f784..dbc4d95d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ * `agat`: - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). + - `agat/agat_sp_statistics`: provides exhaustive statistics of a GTF/GFF file (PR #107). 
+ * `bd_rhapsody/bd_rhapsody_sequence_analysis`: BD Rhapsody Sequence Analysis CWL pipeline (PR #96). @@ -49,12 +51,16 @@ based on a provided sequence IDs or region coordinates file (PR #85). * `agat`: + - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). + - `agat_convert_bed2gff`: convert bed file to gff format (PR #97). + - `agat_convert_embl2gff`: convert an EMBL file into GFF format (PR #99). - `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). - `agat/agat_convert_bed2gff`: convert bed file to gff format (PR #97). - `agat/agat_convert_embl2gff`: convert an EMBL file into GFF format (PR #99). - `agat/agat_convert_sp_gff2tsv`: convert gtf/gff file into tabulated file (PR #102). - `agat/agat_convert_sp_gxf2gxf`: fixes and/or standardizes any GTF/GFF file into full sorted GTF/GFF file (PR #103). + * `bedtools`: - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94). - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98). @@ -91,6 +97,7 @@ * `trimgalore`: Quality and adapter trimming for fastq files (PR #117). + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). diff --git a/src/agat/agat_sp_statistics/config.vsh.yaml b/src/agat/agat_sp_statistics/config.vsh.yaml new file mode 100644 index 00000000..6890bb84 --- /dev/null +++ b/src/agat/agat_sp_statistics/config.vsh.yaml @@ -0,0 +1,93 @@ +name: agat_sp_statistics +namespace: agat +description: | + The script provides exhaustive statistics of a GTF/GFF file. + + If you have isoforms in your file, even if correct, some values calculated + might seem incoherent: e.g. the total mRNA length can be greater than the + genome size, because the lengths of all isoforms are added. 
It is why by + default we always compute the statistics twice when there are isoforms, + once with the isoforms, once without (In that case we keep the longest + isoform per locus). +keywords: [gene annotations, statistics, gff] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/en/latest/tools/agat_sp_statistics.html + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +requirements: + - commands: [agat] +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --gff + alternatives: [-i] + description: Input GTF/GFF file. + type: file + required: true + example: input.gff + - name: --gs_fasta + description: | + Genome size directly from a fasta file to compute more statistics. + type: file + example: genome.fasta + - name: Outputs + arguments: + - name: --output + alternatives: [-o] + description: | + The file where the results will be written. + type: file + direction: output + required: true + example: output.txt + - name: Options + arguments: + - name: --plot + alternatives: [-p, -d] + description: | + When this option is used, an histogram of distribution of the features will be printed in pdf files. + type: boolean_true + - name: --gs_size + description: | + Genome size in nucleotides to compute more statistics. + type: integer + example: 1000000 + - name: --verbose + alternatives: [-v] + description: | + Verbose option. To modify verbosity. Default is 1. 0 is quiet, 2 and 3 are increasing verbosity. + type: integer + example: 1 + - name: --config + alternatives: [-c] + description: | + AGAT config file. By default AGAT takes the original agat_config.yaml shipped with AGAT. The `--config` + option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ type: file + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/.*v\.//; s/\s.*//' | sed 's/^/AGAT: /' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_sp_statistics/help.txt b/src/agat/agat_sp_statistics/help.txt new file mode 100644 index 00000000..fa6ef24d --- /dev/null +++ b/src/agat/agat_sp_statistics/help.txt @@ -0,0 +1,60 @@ +```sh +agat_sp_statistics.pl --help +``` + + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_sp_statistics.pl + +Description: + The script provides exhaustive statistics of a gft/gff file. /!\ If you + have isoforms in your file, even if correct, some values calculated + might sounds incoherent: e.g. total length mRNA can be superior than the + genome size. Because all isoforms length is added... It is why by + default we always compute the statistics twice when there are isoforms, + once with the isoforms, once without (In that case we keep the longest + isoform per locus). + +Usage: + agat_sp_statistics.pl --gff file.gff [ -o outfile ] + agat_sp_statistics.pl --help + +Options: + --gff or -i + Input GTF/GFF file. + + --gs, -f or -g + This option inform about the genome size in oder to compute more + statistics. You can give the size in Nucleotide or directly the + fasta file. 
+ + -d or -p + When this option is used, an histogram of distribution of the + features will be printed in pdf files. (d means distribution, p + means plot). + + -v or --verbose + Verbose option. To modify verbosity. Default is 1. 0 is quiet, 2 + and 3 are increasing verbosity. + + --output or -o + File where will be written the result. If no output file is + specified, the output will be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. \ No newline at end of file diff --git a/src/agat/agat_sp_statistics/script.sh b/src/agat/agat_sp_statistics/script.sh new file mode 100644 index 00000000..9865c4b2 --- /dev/null +++ b/src/agat/agat_sp_statistics/script.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +## VIASH END + +# unset flags +[[ "$par_d" == "false" ]] && unset par_d + +if [[ -n "$par_gs_size" && -n "$par_gs_fasta" ]]; then + echo "[error] Please provide only one of the following options to set genome size: --gs_size or --gs_fasta" + exit 1 +fi + +# run agat_sp_statistics +agat_sp_statistics.pl \ + -i "$par_gff" \ + -o "$par_output" \ + ${par_plot:+-d} \ + ${par_gs_size:+--gs "${par_gs_size}"} \ + ${par_gs_fasta:+--gs "${par_gs_fasta}"} \ + ${par_verbose:+--verbose "${par_verbose}"} \ + ${par_config:+--config "${par_config}"} + + diff --git a/src/agat/agat_sp_statistics/test.sh b/src/agat/agat_sp_statistics/test.sh new file mode 100644 index 00000000..35f42ee0 --- /dev/null +++ b/src/agat/agat_sp_statistics/test.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -eo pipefail + +test_dir="${meta_resources_dir}/test_data" + +# create 
temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +cd "$TMPDIR" + +mkdir test1 +pushd test1 + +echo "> Run $meta_name with test data and --emblmygff3" +"$meta_executable" \ + --gff "$test_dir/1.gff" \ + --output "output.txt" \ + +echo ">> Checking output" +[ ! -f "output.txt" ] && echo "Output file output.txt does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "output.txt" ] && echo "Output file output.txt is empty" && exit 1 + +echo ">> Check if output matches expected output" +diff "output.txt" "$test_dir/stats_out.txt" +if [ $? -ne 0 ]; then + echo "Output file output.txt does not match expected output" + exit 1 +fi + +echo "> Test successful" + + +popd +mkdir test2 +pushd test2 + +cat < genome.fasta +>sample_sequence +ATGCGTACGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC +EOF + +echo "> Run $meta_name with both gs_size and gs_fasta" +error_message=$("$meta_executable" \ + --gff "$test_dir/1.gff" \ + --output "output.txt" \ + --gs_size "1000000" \ + --gs_fasta "genome.fasta" 2>&1 || true) + +expected_error="[error] Please provide only one of the following options to set genome size: --gs_size or --gs_fasta" +if [[ "$error_message" != *"$expected_error"* ]]; then + echo "Output error message: $error_message does not match expected error message: $expected_error" + exit 1 +fi + +echo "> Error test successful" + +echo "---- All tests succeeded! 
----" +exit 0 \ No newline at end of file diff --git a/src/agat/agat_sp_statistics/test_data/1.gff b/src/agat/agat_sp_statistics/test_data/1.gff new file mode 100644 index 00000000..775d14fd --- /dev/null +++ b/src/agat/agat_sp_statistics/test_data/1.gff @@ -0,0 +1,78 @@ +##gff-version 3 +##sequence-region 1 1 43270923 +#!genome-build RAP-DB IRGSP-1.0 +#!genome-version IRGSP-1.0 +#!genome-date 2015-10 +#!genome-build-accession GCA_001433935.1 +1 RAP-DB chromosome 1 43270923 . . . ID=chromosome:1;Alias=Chr1,AP014957.1,NC_029256.1 +### +1 irgsp repeat_region 2000 2100 . + . ID=fakeRepeat1 +### +1 irgsp gene 2983 10815 . + . ID=gene:Os01g0100100;biotype=protein_coding;description=RabGAP/TBC domain containing protein. (Os01t0100100-01);gene_id=Os01g0100100;logic_name=irgspv1.0-20170804-genes +1 irgsp mRNA 2983 10815 . + . ID=transcript:Os01t0100100-01;Parent=gene:Os01g0100100;biotype=protein_coding;transcript_id=Os01t0100100-01 +1 irgsp exon 2983 3268 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon1;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100100-01.exon1;rank=1 +1 irgsp five_prime_UTR 2983 3268 . + . Parent=transcript:Os01t0100100-01 +1 irgsp five_prime_UTR 3354 3448 . + . Parent=transcript:Os01t0100100-01 +1 irgsp exon 3354 3616 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=-1;exon_id=Os01t0100100-01.exon2;rank=2 +1 irgsp CDS 3449 3616 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 4357 4455 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=Os01t0100100-01.exon3;rank=3 +1 irgsp CDS 4357 4455 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 5457 5560 . + . 
Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon4;constitutive=1;ensembl_end_phase=2;ensembl_phase=0;exon_id=Os01t0100100-01.exon4;rank=4 +1 irgsp CDS 5457 5560 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 7136 7944 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon5;constitutive=1;ensembl_end_phase=1;ensembl_phase=2;exon_id=Os01t0100100-01.exon5;rank=5 +1 irgsp CDS 7136 7944 . + 1 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 8028 8150 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon6;constitutive=1;ensembl_end_phase=1;ensembl_phase=1;exon_id=Os01t0100100-01.exon6;rank=6 +1 irgsp CDS 8028 8150 . + 2 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 8232 8320 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon7;constitutive=1;ensembl_end_phase=0;ensembl_phase=1;exon_id=Os01t0100100-01.exon7;rank=7 +1 irgsp CDS 8232 8320 . + 2 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 8408 8608 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon8;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=Os01t0100100-01.exon8;rank=8 +1 irgsp CDS 8408 8608 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 9210 9615 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon9;constitutive=1;ensembl_end_phase=1;ensembl_phase=0;exon_id=Os01t0100100-01.exon9;rank=9 +1 irgsp CDS 9210 9615 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 10102 10187 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon10;constitutive=1;ensembl_end_phase=0;ensembl_phase=1;exon_id=Os01t0100100-01.exon10;rank=10 +1 irgsp CDS 10102 10187 . 
+ 2 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp CDS 10274 10297 . + 0 ID=CDS:Os01t0100100-01;Parent=transcript:Os01t0100100-01;protein_id=Os01t0100100-01 +1 irgsp exon 10274 10430 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon11;constitutive=1;ensembl_end_phase=-1;ensembl_phase=0;exon_id=Os01t0100100-01.exon11;rank=11 +1 irgsp three_prime_UTR 10298 10430 . + . Parent=transcript:Os01t0100100-01 +1 irgsp exon 10504 10815 . + . Parent=transcript:Os01t0100100-01;Name=Os01t0100100-01.exon12;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100100-01.exon12;rank=12 +1 irgsp three_prime_UTR 10504 10815 . + . Parent=transcript:Os01t0100100-01 +### +1 irgsp gene 11218 12435 . + . ID=gene:Os01g0100200;biotype=protein_coding;description=Conserved hypothetical protein. (Os01t0100200-01);gene_id=Os01g0100200;logic_name=irgspv1.0-20170804-genes +1 irgsp mRNA 11218 12435 . + . ID=transcript:Os01t0100200-01;Parent=gene:Os01g0100200;biotype=protein_coding;transcript_id=Os01t0100200-01 +1 irgsp five_prime_UTR 11218 11797 . + . Parent=transcript:Os01t0100200-01 +1 irgsp exon 11218 12060 . + . Parent=transcript:Os01t0100200-01;Name=Os01t0100200-01.exon1;constitutive=1;ensembl_end_phase=2;ensembl_phase=-1;exon_id=Os01t0100200-01.exon1;rank=1 +1 irgsp CDS 11798 12060 . + 0 ID=CDS:Os01t0100200-01;Parent=transcript:Os01t0100200-01;protein_id=Os01t0100200-01 +1 irgsp CDS 12152 12317 . + 1 ID=CDS:Os01t0100200-01;Parent=transcript:Os01t0100200-01;protein_id=Os01t0100200-01 +1 irgsp exon 12152 12435 . + . Parent=transcript:Os01t0100200-01;Name=Os01t0100200-01.exon2;constitutive=1;ensembl_end_phase=-1;ensembl_phase=2;exon_id=Os01t0100200-01.exon2;rank=2 +1 irgsp three_prime_UTR 12318 12435 . + . Parent=transcript:Os01t0100200-01 +### +1 irgsp gene 11372 12284 . - . ID=gene:Os01g0100300;biotype=protein_coding;description=Cytochrome P450 domain containing protein. 
(Os01t0100300-00);gene_id=Os01g0100300;logic_name=irgspv1.0-20170804-genes +1 irgsp mRNA 11372 12284 . - . ID=transcript:Os01t0100300-00;Parent=gene:Os01g0100300;biotype=protein_coding;transcript_id=Os01t0100300-00 +1 irgsp exon 11372 12042 . - . Parent=transcript:Os01t0100300-00;Name=Os01t0100300-00.exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=1;exon_id=Os01t0100300-00.exon2;rank=2 +1 irgsp CDS 11372 12042 . - 2 ID=CDS:Os01t0100300-00;Parent=transcript:Os01t0100300-00;protein_id=Os01t0100300-00 +1 irgsp exon 12146 12284 . - . Parent=transcript:Os01t0100300-00;Name=Os01t0100300-00.exon1;constitutive=1;ensembl_end_phase=1;ensembl_phase=0;exon_id=Os01t0100300-00.exon1;rank=1 +1 irgsp CDS 12146 12284 . - 0 ID=CDS:Os01t0100300-00;Parent=transcript:Os01t0100300-00;protein_id=Os01t0100300-00 +### +1 irgsp gene 12721 15685 . + . ID=gene:Os01g0100400;biotype=protein_coding;description=Similar to Pectinesterase-like protein. (Os01t0100400-01);gene_id=Os01g0100400;logic_name=irgspv1.0-20170804-genes +1 irgsp mRNA 12721 15685 . + . ID=transcript:Os01t0100400-01;Parent=gene:Os01g0100400;biotype=protein_coding;transcript_id=Os01t0100400-01 +1 irgsp five_prime_UTR 12721 12773 . + . Parent=transcript:Os01t0100400-01 +1 irgsp exon 12721 13813 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon1;constitutive=1;ensembl_end_phase=2;ensembl_phase=-1;exon_id=Os01t0100400-01.exon1;rank=1 +1 irgsp CDS 12774 13813 . + 0 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01 +1 irgsp exon 13906 14271 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon2;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=Os01t0100400-01.exon2;rank=2 +1 irgsp CDS 13906 14271 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01 +1 irgsp exon 14359 14437 . + . 
Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=2;exon_id=Os01t0100400-01.exon3;rank=3 +1 irgsp CDS 14359 14437 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01 +1 irgsp exon 14969 15171 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon4;constitutive=1;ensembl_end_phase=2;ensembl_phase=0;exon_id=Os01t0100400-01.exon4;rank=4 +1 irgsp CDS 14969 15171 . + 0 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01 +1 irgsp CDS 15266 15359 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01 +1 irgsp exon 15266 15685 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon5;constitutive=1;ensembl_end_phase=-1;ensembl_phase=2;exon_id=Os01t0100400-01.exon5;rank=5 +1 irgsp three_prime_UTR 15360 15685 . + . Parent=transcript:Os01t0100400-01 +### +1 irgsp gene 12808 13978 . - . ID=gene:Os01g0100466;biotype=protein_coding;description=Hypothetical protein. (Os01t0100466-00);gene_id=Os01g0100466;logic_name=irgspv1.0-20170804-genes +1 irgsp mRNA 12808 13978 . - . ID=transcript:Os01t0100466-00;Parent=gene:Os01g0100466;biotype=protein_coding;transcript_id=Os01t0100466-00 +1 irgsp three_prime_UTR 12808 12868 . - . Parent=transcript:Os01t0100466-00 +1 irgsp exon 12808 13782 . - . Parent=transcript:Os01t0100466-00;Name=Os01t0100466-00.exon2;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100466-00.exon2;rank=2 +1 irgsp CDS 12869 13102 . - 0 ID=CDS:Os01t0100466-00;Parent=transcript:Os01t0100466-00;protein_id=Os01t0100466-00 +1 irgsp five_prime_UTR 13103 13782 . - . Parent=transcript:Os01t0100466-00 +1 irgsp exon 13880 13978 . - . Parent=transcript:Os01t0100466-00;Name=Os01t0100466-00.exon1;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100466-00.exon1;rank=1 +1 irgsp five_prime_UTR 13880 13978 . - . 
Parent=transcript:Os01t0100466-00 \ No newline at end of file diff --git a/src/agat/agat_sp_statistics/test_data/script.sh b/src/agat/agat_sp_statistics/test_data/script.sh new file mode 100755 index 00000000..5b1133ac --- /dev/null +++ b/src/agat/agat_sp_statistics/test_data/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -eo pipefail +# clone the AGAT repo into /tmp/agat_source unless that directory already exists +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data; NOTE(review): test.sh compares against stats_out.txt, which this script does not create - confirm how it is generated +cp -r /tmp/agat_source/t/scripts_output/in/1.gff src/agat/agat_sp_statistics/test_data +cp -r /tmp/agat_source/t/scripts_output/out/agat_sp_statistics_1.txt src/agat/agat_sp_statistics/test_data + +# keep only the first 78 lines of 1.gff (written to a temporary file first, then moved over the original) +head -n 78 src/agat/agat_sp_statistics/test_data/1.gff > src/agat/agat_sp_statistics/test_data/1.gff.tmp +mv src/agat/agat_sp_statistics/test_data/1.gff.tmp src/agat/agat_sp_statistics/test_data/1.gff \ No newline at end of file diff --git a/src/agat/agat_sp_statistics/test_data/stats_out.txt b/src/agat/agat_sp_statistics/test_data/stats_out.txt new file mode 100644 index 00000000..b160ea52 --- /dev/null +++ b/src/agat/agat_sp_statistics/test_data/stats_out.txt @@ -0,0 +1,93 @@ +-------------------------------------------------------------------------------- + +---------------------------------- chromosome ---------------------------------- +Number of chromosome 1 +Number chromosome overlapping 0 +Total chromosome length (bp) 43270923 +mean chromosome length (bp) 43270923 +Longest chromosome (bp) 43270923 +Shortest chromosome (bp) 43270923 + +-------------------------------- repeat_region --------------------------------- +Number of repeat_region 1 +Number repeat_region overlapping 0 +Total repeat_region length (bp) 101 +mean repeat_region length (bp) 101 +Longest repeat_region (bp) 101 +Shortest repeat_region (bp) 101 + +------------------------------------- mrna ------------------------------------- +Number of gene 5 +Number of mrna 5 
+Number of mrnas with utr both sides 4 +Number of mrnas with at least one utr 4 +Number of cds 5 +Number of exon 23 +Number of five_prime_utr 4 +Number of three_prime_utr 4 +Number of exon in cds 20 +Number of exon in five_prime_utr 6 +Number of exon in three_prime_utr 5 +Number of intron in cds 15 +Number of intron in exon 18 +Number of intron in five_prime_utr 2 +Number of intron in three_prime_utr 1 +Number gene overlapping 2 +mean mrnas per gene 1.0 +mean cdss per mrna 1.0 +mean exons per mrna 4.6 +mean five_prime_utrs per mrna 0.8 +mean three_prime_utrs per mrna 0.8 +mean exons per cds 4.0 +mean exons per five_prime_utr 1.5 +mean exons per three_prime_utr 1.2 +mean introns in cdss per mrna 3.0 +mean introns in exons per mrna 3.6 +mean introns in five_prime_utrs per mrna 0.4 +mean introns in three_prime_utrs per mrna 0.2 +Total gene length (bp) 14100 +Total mrna length (bp) 14100 +Total cds length (bp) 5364 +Total exon length (bp) 8107 +Total five_prime_utr length (bp) 1793 +Total three_prime_utr length (bp) 950 +Total intron length per cds (bp) 5738 +Total intron length per exon (bp) 5993 +Total intron length per five_prime_utr (bp) 182 +Total intron length per three_prime_utr (bp) 73 +mean gene length (bp) 2820 +mean mrna length (bp) 2820 +mean cds length (bp) 1072 +mean exon length (bp) 352 +mean five_prime_utr length (bp) 448 +mean three_prime_utr length (bp) 237 +mean cds piece length (bp) 268 +mean five_prime_utr piece length (bp) 298 +mean three_prime_utr piece length (bp) 190 +mean intron in cds length (bp) 382 +mean intron in exon length (bp) 332 +mean intron in five_prime_utr length (bp) 91 +mean intron in three_prime_utr length (bp) 73 +Longest gene (bp) 7833 +Longest mrna (bp) 7833 +Longest cds (bp) 2109 +Longest exon (bp) 1093 +Longest five_prime_utr (bp) 779 +Longest three_prime_utr (bp) 445 +Longest cds piece (bp) 1040 +Longest five_prime_utr piece (bp) 680 +Longest three_prime_utr piece (bp) 326 +Longest intron into cds part (bp) 1575 +Longest 
intron into exon part (bp) 1575 +Longest intron into five_prime_utr part (bp) 97 +Longest intron into three_prime_utr part (bp)73 +Shortest gene (bp) 913 +Shortest mrna (bp) 913 +Shortest cds piece (bp) 24 +Shortest five_prime_utr piece (bp) 53 +Shortest three_prime_utr piece (bp) 61 +Shortest intron into cds part (bp) 81 +Shortest intron into exon part (bp) 73 +Shortest intron into five_prime_utr part (bp)85 +Shortest intron into three_prime_utr part (bp)73 +