diff --git a/CHANGELOG.md b/CHANGELOG.md index e987a225..709ca4a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,7 @@ * `star/star_align_reads`: Align reads to a reference genome (PR #22). -* `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #25). +* `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #29). ## MAJOR CHANGES diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index 705b8cb4..009b852a 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -34,7 +34,7 @@ argument_groups: direction: input description: | is a tab-delimited file providing this info for each of the mapped - sequences: (useful for -A option with + sequences: (useful for --description option with mRNA/EST/protein mappings). - name: --genome alternatives: -g @@ -52,7 +52,8 @@ argument_groups: direction: output must_exist: false description: | - Write the output records into instead of stdout. + Write the output records into . + default: output.gff - name: --force_exons type: boolean_true description: | @@ -93,12 +94,12 @@ argument_groups: - name: --w_add type: integer description: | - For the -w option, extract additional bases both upstream and downstream of - the transcript boundaries. + For the --spliced_exons option, extract additional bases both upstream and + downstream of the transcript boundaries. - name: --w_nocds type: boolean_true description: | - For -w, disable the output of CDS info in the FASTA file. + For --spliced_exons, disable the output of CDS info in the FASTA file. - name: --spliced_cds alternatives: -x type: file @@ -117,13 +118,13 @@ argument_groups: alternatives: -W type: boolean_true description: | - For -w, -x and -y options, write in the FASTA defline all the exon coordinates - projected onto the spliced sequence. 
+ For --spliced_exons, --spliced_cds and --tr_cds options, write in the FASTA defline + all the exon coordinates projected onto the spliced sequence. - name: --stop_dot alternatives: -S type: boolean_true description: | - For -y option, use '*' instead of '.' as stop codon translation. + For --tr_cds option, use '*' instead of '.' as stop codon translation. - name: --id_version alternatives: -L type: boolean_true @@ -160,8 +161,9 @@ argument_groups: of GFF attributes given in ; special pseudo-attributes (prefixed by @) are recognized: @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen - If any of -w/-y/-x FASTA output files are enabled, the same fields (excluding @id) are - appended to the definition line of corresponding FASTA records. + If any of --spliced_exons/--tr_cds/--spliced_cds FASTA output files are enabled, the + same fields (excluding @id) are appended to the definition line of corresponding FASTA + records. - name: --expose_dups type: boolean_true alternatives: [-E, -v] @@ -198,7 +200,7 @@ argument_groups: alternatives: -R type: boolean_true description: | - For -r option, discard all transcripts that are not fully contained within the given + For --range option, discard all transcripts that are not fully contained within the given range. - name: --jmatch type: string @@ -279,25 +281,25 @@ argument_groups: alternatives: -V type: boolean_true description: | - Discard any mRNAs with CDS having in-frame stop codons (requires -g). + Discard any mRNAs with CDS having in-frame stop codons (requires --genome). - name: --adj_cds_start alternatives: -H type: boolean_true description: | - For -V option, check and adjust the starting CDS phase if the original phase leads to a - translation with an in-frame stop codon. + For --rm_stop_codons option, check and adjust the starting CDS phase if the original phase + leads to a translation with an in-frame stop codon.
- name: --opposite_strand alternatives: -B type: boolean_true description: | For -V option, single-exon transcripts are also checked on the opposite strand (requires - -g). + --genome). - name: --coding_status alternatives: -P type: boolean_true description: | Add transcript level GFF attributes about the coding status of each transcript, including - partialness or in-frame stop codons (requires -g). + partialness or in-frame stop codons (requires --genome). - name: --add_hasCDS type: boolean_true description: | @@ -305,7 +307,7 @@ argument_groups: - name: --adj_stop type: boolean_true description: | - Stop codon adjustment: enables -P and performs automatic adjustment of the CDS stop + Stop codon adjustment: enables --coding_status and performs automatic adjustment of the CDS stop coordinate if premature or downstream. - name: --rm_noncanon alternatives: -N @@ -350,31 +352,31 @@ argument_groups: alternatives: -d type: file description: | - For -M option, write duplication info to file . + For --merge option, write duplication info to file . - name: --cluster_only type: boolean_true description: | - Same as -M/--merge but without discarding any of the "duplicate" transcripts, only create + Same as --merge but without discarding any of the "duplicate" transcripts, only create "locus" features. - name: --rm_redundant alternatives: -K type: boolean_true description: | - For -M option: also discard as redundant the shorter, fully contained transcripts (intron + For --merge option: also discard as redundant the shorter, fully contained transcripts (intron chains matching a part of the container). - name: --no_boundary alternatives: -Q type: boolean_true description: | - For -M option, no longer require boundary containment when assessing redundancy (can be - combined with -K); only introns have to match for multi-exon transcripts, and >=80% overlap - for single-exon transcripts. 
+ For --merge option, no longer require boundary containment when assessing redundancy (can be + combined with --rm_redundant); only introns have to match for multi-exon transcripts, and >=80% + overlap for single-exon transcripts. - name: --no_overlap alternatives: -Y type: boolean_true description: | - For -M option, enforce -Q but also discard overlapping single-exon transcripts, even on - the opposite strand (can be combined with -K). + For --merge option, enforce --no_boundary but also discard overlapping single-exon transcripts, + even on the opposite strand (can be combined with --rm_redundant). resources: - type: bash_script @@ -390,7 +392,7 @@ engines: setup: - type: docker run: | - echo "gffread: \"0.12.7\"" > /var/software_versions.txt + echo "gffread: \"$(gffread --version 2>&1)\"" > /var/software_versions.txt runners: - type: executable - type: nextflow \ No newline at end of file diff --git a/src/gffread/test.sh b/src/gffread/test.sh index c7553839..5eebe9a6 100755 --- a/src/gffread/test.sh +++ b/src/gffread/test.sh @@ -18,8 +18,9 @@ echo "> Test 1 - Read annotation file, output GFF" "$meta_executable" \ --expose_dups \ - --input "$test_dir/annotation.gff" \ - -o "$test_output_dir/ann_simple.gff" + --outfile "$test_output_dir/ann_simple.gff" \ + --input "$test_dir/sequence.gff3" + echo ">> Check if output exists" [ ! -f "$test_output_dir/ann_simple.gff" ] \ @@ -31,13 +32,11 @@ echo ">> Check if output is empty" echo ">> Compare output to expected output" -# Not comparing header lines as they are not in the same order -# (reference file was created with gffred version 11.8 instead of 12.7).
-diff <(grep -v '^#' "$expected_output_dir/ann_simple.gff") \ - <(grep -v '^#' "$test_output_dir/ann_simple.gff") || \ +# compare files, except for lines starting with "#" +diff <(grep -v "^#" "$expected_output_dir/ann_simple.gff") \ + <(grep -v "^#" "$test_output_dir/ann_simple.gff") || \ (echo "Output file ann_simple.gff does not match expected output" && exit 1) - ################################################################################ echo "> Test 2 - Read annotation file, output GTF" @@ -45,7 +44,7 @@ echo "> Test 2 - Read annotation file, output GTF" "$meta_executable" \ --gtf_output \ --outfile "$test_output_dir/annotation.gtf" \ - --input "$test_dir/annotation.gff" + --input "$test_dir/sequence.gff3" echo ">> Check if output exists" [ ! -f "$test_output_dir/annotation.gtf" ] \ @@ -56,13 +55,6 @@ echo ">> Check if output is empty" && echo "Output file test_output/annotation.gtf is empty" && exit 1 echo ">> Compare output to expected output" - -# removing trailing semicolons from the files -# Difference in trailing semicolon presence possibly due to reference output -# being generated by gffred version 11.8 instead of 12.7. -sed -i 's/;$/''/' "$expected_output_dir/annotation.gtf" -sed -i 's/;$/''/' "$test_output_dir/annotation.gtf" - diff "$expected_output_dir/annotation.gtf" "$test_output_dir/annotation.gtf" || \ (echo "Output file annotation.gtf does not match expected output" && exit 1) @@ -72,9 +64,9 @@ echo "> Test 3 - Generate fasta file from annotation file" "$meta_executable" \ - --genome "$test_dir/genome.fa" \ + --genome "$test_dir/sequence.fasta" \ --spliced_exons "$test_output_dir/transcripts.fa" \ - --input "$test_dir/annotation.gff" + --input "$test_dir/sequence.gff3" echo ">> Check if output exists" [ !
-f "$test_output_dir/transcripts.fa" ] \ @@ -92,12 +84,10 @@ diff "$expected_output_dir/transcripts.fa" "$test_output_dir/transcripts.fa" || echo "> Test 4 - Generate table from GFF annotation file" -# reference output annotation.tbl was created manually with gffread 0.12.7 - "$meta_executable" \ --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ --outfile "$test_output_dir/annotation.tbl" \ - --input "$test_dir/annotation.gff" + --input "$test_dir/sequence.gff3" echo ">> Check if output exists" [ ! -f "$test_output_dir/annotation.tbl" ] \ diff --git a/src/gffread/test_data/output/ann_simple.gff b/src/gffread/test_data/output/ann_simple.gff index 828ee2fa..c8e5e933 100644 --- a/src/gffread/test_data/output/ann_simple.gff +++ b/src/gffread/test_data/output/ann_simple.gff @@ -1,85 +1,5 @@ -# gffread -E annotation.gff -o ann_simple.gff -# gffread v0.11.8 ##gff-version 3 -NT_187562.1 BestRefSeq mRNA 411 68627 . + . ID=rna157470;geneID=gene55473;gene_name=MGAM -NT_187562.1 BestRefSeq exon 411 495 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 1995 2051 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 2602 2726 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 9665 9753 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 12115 12164 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 12577 12744 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 14174 14326 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 14664 14864 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 16634 16788 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 17438 17606 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 17880 17976 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 18710 18822 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 20083 20208 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 21352 21480 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 21736 21846 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 22191 22253 . + . 
Parent=rna157470 -NT_187562.1 BestRefSeq exon 24448 24582 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 25379 25466 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 26264 26402 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 27215 27348 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 56500 56534 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 56627 56743 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 57445 57593 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 58211 58295 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 59473 59529 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 61493 61617 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 62682 62770 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 64510 64559 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 65149 65319 . + . Parent=rna157470 -NT_187562.1 BestRefSeq exon 67694 68627 . + . Parent=rna157470 -NT_187562.1 BestRefSeq CDS 411 495 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 1995 2051 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 2602 2726 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 9665 9753 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 12115 12164 . + 2 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 12577 12744 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 14174 14326 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 14664 14864 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 16634 16788 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 17438 17606 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 17880 17976 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 18710 18822 . + 2 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 20083 20208 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 21352 21480 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 21736 21846 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 22191 22253 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 24448 24582 . 
+ 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 25379 25466 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 26264 26402 . + 2 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 27215 27348 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 56500 56534 . + 2 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 56627 56743 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 57445 57593 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 58211 58295 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 59473 59529 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 61493 61617 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 62682 62770 . + 1 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 64510 64559 . + 2 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 65149 65319 . + 0 Parent=rna157470 -NT_187562.1 BestRefSeq CDS 67694 67771 . + 0 Parent=rna157470 -NT_187562.1 Curated Genomic mRNA 214038 219958 . - . ID=rna157473;geneID=gene55476;gene_name=PRSS58 -NT_187562.1 Curated Genomic exon 214038 214270 . - . Parent=rna157473 -NT_187562.1 Curated Genomic exon 214372 214511 . - . Parent=rna157473 -NT_187562.1 Curated Genomic exon 216955 217211 . - . Parent=rna157473 -NT_187562.1 Curated Genomic exon 217435 217573 . - . Parent=rna157473 -NT_187562.1 Curated Genomic exon 219568 219648 . - . Parent=rna157473 -NT_187562.1 Curated Genomic exon 219891 219958 . - . Parent=rna157473 -NT_187562.1 Curated Genomic CDS 214121 214270 . - 0 Parent=rna157473 -NT_187562.1 Curated Genomic CDS 214372 214511 . - 2 Parent=rna157473 -NT_187562.1 Curated Genomic CDS 216955 217211 . - 1 Parent=rna157473 -NT_187562.1 Curated Genomic CDS 217435 217573 . - 2 Parent=rna157473 -NT_187562.1 Curated Genomic CDS 219568 219607 . - 0 Parent=rna157473 -NT_187562.1 Curated Genomic transcript 230181 234148 . - . ID=rna157474;geneID=gene55477;gene_name=TRY2P -NT_187562.1 Curated Genomic exon 230181 231784 . - . Parent=rna157474 -NT_187562.1 Curated Genomic exon 233047 233200 . - . 
Parent=rna157474 -NT_187562.1 Curated Genomic exon 233852 234148 . - . Parent=rna157474 -NT_187562.1 BestRefSeq mRNA 962573 963937 . + . ID=rna157497;geneID=gene55581;gene_name=C7orf34 -NT_187562.1 BestRefSeq exon 962573 962821 . + . Parent=rna157497 -NT_187562.1 BestRefSeq exon 963419 963937 . + . Parent=rna157497 -NT_187562.1 BestRefSeq CDS 962614 962821 . + 0 Parent=rna157497 -NT_187562.1 BestRefSeq CDS 963419 963654 . + 2 Parent=rna157497 +# gffread v0.12.7 +# gffread -E -o output/ann_simple.gff sequence.gff3 +NM_141699.3 RefSeq gene 22 795 . + . ID=gene-Dmel_CG16905;gene_name=eloF +NM_141699.3 RefSeq CDS 22 795 . + 0 Parent=gene-Dmel_CG16905 diff --git a/src/gffread/test_data/output/annotation.gtf b/src/gffread/test_data/output/annotation.gtf index 674ddf7a..7e203137 100644 --- a/src/gffread/test_data/output/annotation.gtf +++ b/src/gffread/test_data/output/annotation.gtf @@ -1,82 +1,2 @@ -NT_187562.1 BestRefSeq transcript 411 68627 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 411 495 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 1995 2051 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 2602 2726 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 9665 9753 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 12115 12164 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 12577 12744 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 14174 14326 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 14664 14864 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 16634 16788 . + . 
transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 17438 17606 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 17880 17976 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 18710 18822 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 20083 20208 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 21352 21480 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 21736 21846 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 22191 22253 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 24448 24582 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 25379 25466 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 26264 26402 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 27215 27348 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 56500 56534 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 56627 56743 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 57445 57593 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 58211 58295 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 59473 59529 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 61493 61617 . + . 
transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 62682 62770 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 64510 64559 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 65149 65319 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq exon 67694 68627 . + . transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 411 495 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 1995 2051 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 2602 2726 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 9665 9753 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 12115 12164 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 12577 12744 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 14174 14326 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 14664 14864 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 16634 16788 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 17438 17606 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 17880 17976 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 18710 18822 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 20083 20208 . 
+ 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 21352 21480 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 21736 21846 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 22191 22253 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 24448 24582 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 25379 25466 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 26264 26402 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 27215 27348 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 56500 56534 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 56627 56743 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 57445 57593 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 58211 58295 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 59473 59529 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 61493 61617 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 62682 62770 . + 1 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 64510 64559 . + 2 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 65149 65319 . + 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 BestRefSeq CDS 67694 67771 . 
+ 0 transcript_id "rna157470"; gene_id "gene55473"; gene_name "MGAM"; -NT_187562.1 Curated Genomic transcript 214038 219958 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 214038 214270 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 214372 214511 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 216955 217211 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 217435 217573 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 219568 219648 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic exon 219891 219958 . - . transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic CDS 214121 214270 . - 0 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic CDS 214372 214511 . - 2 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic CDS 216955 217211 . - 1 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic CDS 217435 217573 . - 2 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic CDS 219568 219607 . - 0 transcript_id "rna157473"; gene_id "gene55476"; gene_name "PRSS58"; -NT_187562.1 Curated Genomic transcript 230181 234148 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P"; -NT_187562.1 Curated Genomic exon 230181 231784 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P"; -NT_187562.1 Curated Genomic exon 233047 233200 . - . transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P"; -NT_187562.1 Curated Genomic exon 233852 234148 . - . 
transcript_id "rna157474"; gene_id "gene55477"; gene_name "TRY2P"; -NT_187562.1 BestRefSeq transcript 962573 963937 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34"; -NT_187562.1 BestRefSeq exon 962573 962821 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34"; -NT_187562.1 BestRefSeq exon 963419 963937 . + . transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34"; -NT_187562.1 BestRefSeq CDS 962614 962821 . + 0 transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34"; -NT_187562.1 BestRefSeq CDS 963419 963654 . + 2 transcript_id "rna157497"; gene_id "gene55581"; gene_name "C7orf34"; +NM_141699.3 RefSeq transcript 22 795 . + . transcript_id "gene-Dmel_CG16905"; gene_id "gene-Dmel_CG16905"; gene_name "eloF" +NM_141699.3 RefSeq CDS 22 795 . + 0 transcript_id "gene-Dmel_CG16905"; gene_name "eloF"; diff --git a/src/gffread/test_data/output/annotation.tbl b/src/gffread/test_data/output/annotation.tbl index d707d61a..15a5c0fd 100644 --- a/src/gffread/test_data/output/annotation.tbl +++ b/src/gffread/test_data/output/annotation.tbl @@ -1,4 +1 @@ -rna157470 NT_187562.1 411 68627 + 411-495,1995-2051,2602-2726,9665-9753,12115-12164,12577-12744,14174-14326,14664-14864,16634-16788,17438-17606,17880-17976,18710-18822,20083-20208,21352-21480,21736-21846,22191-22253,24448-24582,25379-25466,26264-26402,27215-27348,56500-56534,56627-56743,57445-57593,58211-58295,59473-59529,61493-61617,62682-62770,64510-64559,65149-65319,67694-68627 NM_004668.2 MGAM maltase-glucoamylase -rna157473 NT_187562.1 214038 219958 - 214038-214270,214372-214511,216955-217211,217435-217573,219568-219648,219891-219958 NM_001001317.4 PRSS58 protease%2C serine 58 -rna157474 NT_187562.1 230181 234148 - 230181-231784,233047-233200,233852-234148 NR_036483.2 TRY2P trypsinogen-like pseudogene -rna157497 NT_187562.1 962573 963937 + 962573-962821,963419-963937 NM_178829.4 C7orf34 chromosome 7 open reading frame 34 +gene-Dmel_CG16905 NM_141699.3 22 
795 + 22-795 eloF eloF elongase F diff --git a/src/gffread/test_data/output/transcripts.fa b/src/gffread/test_data/output/transcripts.fa index ff0cde9f..889ebec9 100644 --- a/src/gffread/test_data/output/transcripts.fa +++ b/src/gffread/test_data/output/transcripts.fa @@ -1,119 +1,13 @@ ->rna157470 CDS=2-3343 -GTTCTACGAGGACAACAGCACTTGGGATGTGCACCAACAGTTCTTATGGGGGCCCGGCCTCCTCATCACT -CCAGTTCTGGATGAAGGTGCAGAGAAAGTGATGGCATATGTGCCTGATGCTGTCTGGTATGACTACGAGA -CTGGGAGCCAAGTGAGATGGAGGAAGCAAAAAGTCGAGATGGAACTTCCTGGAGACAAAATTGGACTTCA -CCTTCGAGGAGGCTACATCTTCCCCACACAGCAGCCAAATACAACCACTCTGGCCAGTCGAAAGAACCCT -CTTGGTCTTATCATTGCCCTAGATGAGAACAAAGAAGCAAAAGGAGAACTTTTCTGGGATAATGGGGAAA -CGAAGGATACTGTGGCCAATAAAGTGTATCTTTTATGTGAGTTTTCTGTCACTCAAAACCGCTTGGAGGT -GAATATTTCACAATCAACCTACAAGGACCCCAATAATTTAGCATTTAATGAGATTAAAATTCTTGGGACG -GAGGAACCTAGCAATGTTACAGTGAAACACAATGGTGTCCCAAGTCAGACTTCTCCTACAGTCACTTATG -ATTCTAACCTGAAGGTTGCCATTATCACAGATATTGATCTTCTCCTGGGAGAAGCATACACAGTGGAATG -GAGCATAAAGATAAGGGATGAAGAAAAAATAGACTGTTACCCTGATGAGAATGGTGCTTCTGCCGAAAAC -TGCACTGCCCGTGGCTGTATCTGGGAGGCATCCAATTCTTCTGGAGTCCCTTTTTGCTATTTTGTCAACG -ACCTATACTCTGTCAGTGATGTTCAGTATAATTCCCATGGGGCCACAGCTGACATCTCCTTAAAGTCTTC -CGTTTATGCCAATGCCTTCCCCTCCACACCCGTGAACCCCCTTCGCCTGGATGTCACTTACCATAAGAAT -GAAATGCTGCAGTTCAAGATTTATGATCCCAACAAGAATCGGTATGAAGTTCCAGTCCCTCTGAACATAC -CCAGCATGCCATCCAGCACCCCTGAGGGTCAACTCTATGATGTGCTCATTAAGAAGAATCCATTTGGGAT -TGAAATTCGCCGGAAGAGTACAGGCACTATAATTTGGGACTCTCAGCTCCTTGGCTTTACCTTCAGTGAC -ATGTTTATCCGCATCTCCACCCGCCTTCCCTCCAAGTACCTCTATGGCTTTGGGGAAACTGAGCACAGGT -CCTATAGGAGAGACTTGGAGTGGCACACTTGGGGGATGTTCTCCCGAGACCAGCCCCCAGGGTACAAGAA -GAATTCCTATGGTGTCCACCCCTACTACATGGGGCTGGAGGAGGACGGCAGTGCCCATGGAGTGCTCCTG -CTGAACAGCAATGCCATGGATGTGACGTTCCAGCCCCTGCCTGCCTTGACATACCGCACCACAGGGGGAG -TTCTGGACTTTTATGTGTTCTTGGGGCCGACTCCAGAGCTTGTCACCCAGCAGTACACTGAGTTGATTGG -CCGGCCTGTGATGGTACCTTACTGGTCTTTGGGGTTCCAGCTGTGTCGCTATGGCTACCAGAATGACTCT -GAGATCGCCAGCTTGTATGATGAGATGGTGGCTGCCCAGATCCCTTATGATGTGCAGTACTCAGACATCG 
-ACTACATGGAGCGGCAGCTGGACTTCACCCTCAGCCCCAAGTTTGCTGGGTTTCCAGCTCTGATCAATCG -CATGAAGGCTGATGGGATGCGGGTCATCCTCATTCTGGATCCAGCCATTTCTGGCAATGAGACACAGCCT -TATCCTGCCTTCACTCGGGGCGTGGAGGATGACGTCTTCATCAAATACCCAAATGATGGAGACATTGTCT -GGGGAAAGGTCTGGCCTGATTTTCCTGATGTTGTTGTGAATGGGTCTCTAGACTGGGACAGCCAAGTGGA -GCTATATCGAGCTTATGTGGCCTTCCCAGACTTTTTCCGTAATTCAACTGCCAAGTGGTGGAAGAGGGAA -ATAGAAGAACTATACAACAATCCACAGAATCCAGAGAGGAGCTTGAAGTTTGATGGCATGTGGATTGATA -TGAATGAACCATCAAGCTTCGTGAATGGGGCAGTTTCTCCAGGCTGCAGGGACGCCTCTCTGAACCACCC -TCCCTACATGCCACATTTGGAGTCCAGGGACAGGGGCCTGAGCAGCAAGACCCTTTGTATGGAGAGTCAG -CAGATCCTCCCAGACGGCTCCCTGGTGCAGCACTACAACGTGCACAACCTGTATGGGTGGTCCCAGACCA -GACCCACATACGAAGCCGTGCAGGAGGTGACGGGACAGCGAGGGGTCGTCATCACCCGCTCCACATTTCC -CTCTTCTGGCCGCTGGGCAGGACATTGGCTGGGAGACAACACGGCCGCATGGGATCAGCTGAAGAAGTCT -ATCATTGGCATGATGGAGTTCAGCCTCTTCGGCATATCCTATACGGGAGCAGATATCTGTGGGTTCTTTC -AAGATGCTGAATATGAGATGTGTGTTCGCTGGATGCAGCTGGGGGCCTTTTACCCCTTCTCAAGAAACCA -CAACACCATTGGGACCAGGAGACAAGACCCTGTGTCCTGGGATGTTGCTTTTGTGAATATTTCCAGAACT -GTCCTGCAGACCAGATACACCCTGTTGCCATATCTGTATACCTTGATGCATAAGGCCCACACGGAGGGCG -TCACTGTTGTGCGGCCTCTGCTCCATGAGTTTGTGTCAGACCAGGTGACATGGGACATAGACAGTCAGTT -CCTGCTGGGCCCAGCCTTCCTGGTCAGCCCTGTCCTGGAGCGTAATGCCAGAAATGTCACTGCATATTTC -CCTAGAGCCCGCTGGTATGATTACTACACGGGTGTGGATATTAATGCAAGAGGAGAGTGGAAGACCTTGC -CAGCCCCTCTTGACCACATTAATCTTCATGTCCGTGGGGGCTACATCCTGCCCTGGCAAGAGCCTGCACT -GAACACCCACTTAAGCCGCCAGAAATTCATGGGCTTCAAAATTGCCTTGGATGATGAAGGAACTGCTGGG -GGCTGGCTCTTCTGGGATGATGGGCAAAGCATTGATACCTATGGGAAAGGACTCTATTACTTGGCCAGCT -TTTCTGCCAGCCAGAATACGATGCAAAGCCATATAATTTTCAACAATTACATCACTGGTACAAATCCTTT -GAAACTGGGCTACATTGAAATCTGGGGAGTGGGCAGTGTCCCCGTTACCAGTGTCAGCATCTCTGTGAGT -GGCATGGTCATAACACCCTCCTTCAACAATGACCCCACGACACAGGTATTAAGCATCGATGTGACTGACA -GAAACATCAGCCTACATAATTTTACTTCATTGACGTGGATAAGCACTCTGTGAATTTTTACAGCAAGATT -CTAACTAACTATGAATGACTTTGAAACTACTTATACTTCATACTCATAAAAATTATTGTGTGTTGCTAAT -TTGTTCATACCCACTATTGGTGAAATATTTCTGTTAATTTTGTTATATGTTTTTTGTGTGAACCCTAAAG 
-GTTAAACCTTAGCCCTGTGGGATAGGCAGTTAGGGAGGTGTGGAAAATCTATGCATTACCTTAATGTCTC -TGTGTGGTTAGTATGGTAGTGACTGTTCATCATATGACATTTACTGAAGATGAACTGGGTCCATGATGAA -GTGTGTGTATGTCCACGTTTGTAATCATAGAATGGACCCCATTCTTTTGTTAAATACACAAGAGAAAGCT -TTCTGTGACAGTTCCAGGTCTTGAAGCTAATCAGCATCTCAAGAAAGTATCCAGAAAGAACATCTGCTAG -TTGGTTATAGGCGGTGGGAGGAATAATATACCTAATTGGTTATAGGTGGGGGGAGCATGATAAGCAAAGA -AAAGGCAAACACAAGGAAAGATCAGATGAAACAGAAGATGATAGTAAAAGTGATCCTAAGTAAGAACATA -ATGTAAAATTGTCAGCAGCCTCATGGGGAGGAAAAAGGAAGAGTCAACTCACTTGAAGAAGAGGGTCTTG -AGAAATCCTTAGCATAAAGGGCTACTGGTGAGATTGAGATCTGAGCAGGCAAAGCTCAAAAGAGAGTTTG -GAGGTTAAAAATAATTTATTTTTGCAGTAGTGTGCTTTGAAATGTGTAAATCTTATTTCTAATGTATACA -ACCACATTTCACATAAAAATATGCAATTTATATGCCAGATAAAAATAAAACAAGTGAATTTGCAAGTGA ->rna157473 CDS=110-835 -AAGGCTGGCAAAAAGGAGACCAGACAGGAGGCGTCTGTAGAGATATCATGAACTTCAACTTAGCTTTGTT -TTCCAGAGACTGGAGCTAAACTGGGCTTTCAACATCATCATGAAGTTTATCCTCCTCTGGGCTCTCTTGA -ATCTGACTGTTGCTTTGGCCTTTAATCCAGATTACACAGTCAGCTCCACTCCCCCTTACTTGGTCTATTT -GAAATCTGACTACTTGCCCTGCGCTGGAGTCCTGATCCACCCGCTTTGGGTGATCACAGCTGCACACTGC -AATTTACCAAAGCTTCGGGTGATATTGGGGGTTACAATCCCAGCAGACTCTAATGAAAAGCATCTGCAAG -TGATTGGCTATGAGAAGATGATTCATCATCCACACTTCTCAGTCACTTCTATTGATCATGACATCATGCT -AATCAAGCTGAAAACAGAGGCTGAACTCAATGACTATGTGAAATTAGCCAACCTGCCCTACCAAACTATC -TCTGAAAATACCATGTGCTCTGTCTCTACCTGGAGCTACAATGTGTGTGATATCTACAAAGAGCCCGATT -CACTGCAAACTGTGAACATCTCTGTAATCTCCAAGCCTCAGTGTCGCGATGCCTATAAAACCTACAACAT -CACGGAAAATATGCTGTGTGTGGGCATTGTGCCAGGAAGGAGGCAGCCCTGCAAGGAAGTTTCTGCTGCC -CCGGCAATCTGCAATGGGATGCTTCAAGGAATCCTGTCTTTTGCGGATGGATGTGTTTTGAGAGCCGATG -TTGGCATCTATGCCAAAATTTTTTACTATATACCCTGGATTGAAAATGTAATCCAAAATAACTGAGCTGT -GGCAGTTGTGGACCATATGACACAGCTTGTCCCCATCGTTCACCTTTAGAATTAAATATAAATTAACTCC -TCACATTG ->rna157474 -AAGATTTGTAGGATAGGAAAGGAGGCACCAGGACCTCGGCTTGGTCTGTGCCACTCTCTTTCACCCACAT -CAGAGACGGCTCCCAGCAATGCCGTATATATTTGGGTCTTGATAAGGACTCATGTGAAGACTGCTCTCCT -CTCATCTGTCTTCTGCCCATAATCTCCACAGCTGTTCTTGGCAAGGACACTTGGACTAAAGATTGGGACA -GTTTACAGCTTGATCAACTTGTCTGTTCCTGGCGAGATCTGTCTGCCACGAAGGGTTTTCTCATCGTCGC 
-TCTCTTGAGTGCAGCTGGAGCTGCTCTGGCTAAAGACTCTAAAGATAATCAGGATCTGACATATAATTTT -ACTATTCCTTACATGGTCTATCTTCAGTCCTTCCCAGAACCCTGCGTGGGGTCTCTCATTCACCTTGACT -GGGTATTGACAGCTGCCCACTGCCCCTTACCTGTTGAAATTCGACTGGGAGTTTCTCAACCTAGCATCAC -AAACAAGAAACAACAGATATGAAACTATTCATCGATTGTGACCTACCCTGACTTTGATGCAAAATCCTTG -AACAATGACCTAATGATGATCAAACTATCAACACCTGCTTCACTCAACCCCCATGTGGGGACTATAGCCA -TAGCCTTGGAACCCATTCCATTTAATGAATCCTGCTTTATTCCAACCTGGACCTGGAATGAATATAAAAA -CCGTATGTGTTGGATGCTGTGTTCTTTATGCTGAGGAGGTTGGAGCTGGGAGAAAATGTAGAGGGATGTC -CCTAATGGAAATCGCTGCTATGAGACAATGTTGTCCACCATAGAGGGATTCCTGCCAGGGACCCAAACCA -CCATCACTGGAGGTCAAAGATAAAATTCAAAGAGACTGATAAAGGGCAGCCACTCTGACACCAACAAATG -GAGCAGACAGTAATTTTCATATCTCTAGAATATTGAAGACAAGAGTCCAGAGAACTGGATTGAAATACAG -GTGGGAGAAATAGGAGTCCTCGTGCACAAAGACAGTATGAAGACAGCCTGCGGGAGGGTGACACCCATTG -ATAGTTGCTGAAAGCCCATCACACAATGGAAGAACAGTGGGGGAAATGAACATAAAATATATGTGATGCG -GTGAAGCAGCAGAAGATGCTTCCTACACCTTCCGCAGAGAACCCAGTTATGTGCAGAGAACCCTGGGAGA -GAAGAGAATGGTTGAGGGAGTACCTGTCCTGTCACCAGAGCTGGTTCCCTGCACAGAGTGCTACTCATAC -CCACTGCTTCTTCCCAGAGTTCCCATTTAGGGGTGGAGACTTTGGTGGTTCGTAGGGAAATTGATTATTA -TGACATAATCAGTTGTTATCTCCCTGGTCTGACATGCCTCACAAGAAAAACAGTGGCAGCCCATGTTGTA -AGCACACACCCATCTCCCCTGTGAGCTGAAATTGGGATGAGCAGAAAGATCCGGTCTCAGCAGGAGTCTG -TCTAATAAAGAACACCTGTGAAGAAAACTACTTTCTAATAAGTTTGTTGGTCCTTGTTGGCTGTTACTCC -ATTATTCTCTCCTTAGGTATTGTTGAGGATGAGGATACCACCACTTGATATCTGAAATGAAATGACAAAT -GTAAATCTATTGCTTATTACAGCAAGTCAGAGCAATGCTGACCAGGCACCATCCCACTAAGCAGAATGGA -GTGCTGATTAGCATAAGGGGTTTGTGGGAAGCTTGGAGTTGAGGGATTGGAGGACTTTCAGAGGTATGAA -TGGGTTCCATCATCTGTTGAGGCCTGGCTACTTCAGGGGGATTGCTGTTGATTGATTGGTGCTCTGAGGC -GTGTTTTTCAGAGGACCTCTGATCCTGATCAGCTGGTTTTAAGAAGTGAAGGCTGACATTGATTATACTG -CAAAAGTTATGTTCACTCAAGCAAGTTGTCCTGGATCTATTAATCAGGTTTAAATCTCTCTCTGGTGGCT -ACAGAACAATGCACGTGCTCTGTAAAAGCAAATAAAAGTGGAATTTTAGAAGTCAGTCTTTGGTAAAAAA -CAGAAATAAAAAGCAGAAACTATTA ->rna157497 CDS=42-485 -GGAGAAAATGCCCCTGGAAAGGGTTAAGGGCCAGGACAGGAATGGGGCAGGAGGTGCACGGATCCTGCTG -GGCACTGGGAGCAGGGGGCGGCCAAAGGCAGTGGGTGGGCAGGTCCATGCCTCCCCTGGCCCCCCAGCTC 
-TGCAGGGCAGTGTTCCTGGTTCCTATCTTGCTGCTGCTGCAGGTGAAGCCTCTGAACGGGAGCCCAGGCC -CCAAAGATGGGAGCCAGACAGAGAAAACGCCCTCTGCAGACCAGAATCAAGAACAGTTCGAAGAGCACTT -TGTGGCCTCCTCAGTGGGTGAGATGTGGCAGGTGGTGGACATGGCCCAGCAGGAAGAAGACCAGTCGTCC -AAGACGGCAGCTGTTCACAAGCACTCTTTCCACCTCAGCTTCTGCTTTAGTCTGGCCAGTGTCATGGTTT -TCTCAGGAGGGCCATTGAGGCGGACATTCCCAAATATCCAACTCTGCTTCATGCTCACTCACTGACCCTC -CCTCCCTCCTGGGCTCCAGGTCACAACTCCCAAAGGAGATGCAGGCATGGCTCTCTGCCTCTGATCACCA -TCACTGTATCTCAAGGTTCAGCAGCAGAGATACCAGTTGCCATCAGTGCTAACTGACTGCCTCTCCAGGT -TCGGAGTTTCATCTCCCAGGGCCAGAGACAGCAGACCCACATCCTTCTCTCCCACACCTCTCCTGGTTTT -GTTCAGGACAGCAGATTAGAGGCAGGAGGCAATGACAATAAAATAACGATAAAATCCTGAGAACAATT +>gene-Dmel_CG16905 CDS=1-774 +ATGTTCGCTCCGATAGATCCTGTAAAGATACCCGTTGTAAGCAATCCATGGATAACCATGGGCACATTGA +TTGGCTATCTGCTGTTTGTGCTCAAGCTGGGCCCCAAAATCATGGAGCACCGAAAGCCCTTCCATTTGAA +TGGCGTCATCAGGATCTACAACATATTCCAGATCCTTTACAATGGTCTAATACTCGTTTTAGGAGTTCAC +TTCCTGTTTGTCCTGAAAGCCTACCAAATCAGTTGCATTGTTAGCCTGCCGATGGATCACAAATATAAGG +ATAGAGAGCGTTTGATTTGCACTTTGTACCTGGTGAACAAATTCGTAGACCTTGTGGAAACCATTTTCTT +TGTGCTCCGCAAAAAGGACAGACAGATATCCTTCCTGCACGTCTTCCATCATTTTGCGATGGCATTTTTT +GGATATCTCTACTACTGCTTCCACGGATACGGTGGCGTTGCCTTTCCACAGTGCCTGCTAAACACCGCCG +TCCACGTGATTATGTACGCCTACTACTATCTATCCTCGATCAGCAAGGAGGTGCAGAGAAGTCTCTGGTG +GAAGAAATACATCACAATTGCTCAGCTGGTCCAGTTCGCCATTATTCTGCTCCACTGTACCATCACGCTG +GCACAGCCCAACTGCGCGGTCAACAGACCCTTGACCTACGGATGCGGATCGCTTTCAGCGTTTTTTGCAG +TGATATTTAGCCAATTTTATTACCACAACTACATAAAGCCAGGAAAGAAGTCAGCGAAACAAAACAAAAA +TTAA diff --git a/src/gffread/test_data/script.sh b/src/gffread/test_data/script.sh index daa1907b..0c6e725c 100755 --- a/src/gffread/test_data/script.sh +++ b/src/gffread/test_data/script.sh @@ -1,3 +1,4 @@ +#!/bin/bash # clone repo if [ ! 
-d /tmp/gffread_source ]; then diff --git a/src/gffread/test_data/sequence.fasta b/src/gffread/test_data/sequence.fasta new file mode 100644 index 00000000..31ec0f04 --- /dev/null +++ b/src/gffread/test_data/sequence.fasta @@ -0,0 +1,16 @@ +>NM_141699.3 Drosophila melanogaster elongase F (eloF), mRNA +CACAACTCGATTAGATTCGCCATGTTCGCTCCGATAGATCCTGTAAAGATACCCGTTGTAAGCAATCCAT +GGATAACCATGGGCACATTGATTGGCTATCTGCTGTTTGTGCTCAAGCTGGGCCCCAAAATCATGGAGCA +CCGAAAGCCCTTCCATTTGAATGGCGTCATCAGGATCTACAACATATTCCAGATCCTTTACAATGGTCTA +ATACTCGTTTTAGGAGTTCACTTCCTGTTTGTCCTGAAAGCCTACCAAATCAGTTGCATTGTTAGCCTGC +CGATGGATCACAAATATAAGGATAGAGAGCGTTTGATTTGCACTTTGTACCTGGTGAACAAATTCGTAGA +CCTTGTGGAAACCATTTTCTTTGTGCTCCGCAAAAAGGACAGACAGATATCCTTCCTGCACGTCTTCCAT +CATTTTGCGATGGCATTTTTTGGATATCTCTACTACTGCTTCCACGGATACGGTGGCGTTGCCTTTCCAC +AGTGCCTGCTAAACACCGCCGTCCACGTGATTATGTACGCCTACTACTATCTATCCTCGATCAGCAAGGA +GGTGCAGAGAAGTCTCTGGTGGAAGAAATACATCACAATTGCTCAGCTGGTCCAGTTCGCCATTATTCTG +CTCCACTGTACCATCACGCTGGCACAGCCCAACTGCGCGGTCAACAGACCCTTGACCTACGGATGCGGAT +CGCTTTCAGCGTTTTTTGCAGTGATATTTAGCCAATTTTATTACCACAACTACATAAAGCCAGGAAAGAA +GTCAGCGAAACAAAACAAAAATTAACTAAATTTAAACTAAATCATGAGTACAAAGCCTAAAGATTCGTGA +AGCAACAATAGCCACAGCCTATTTTTGAATATTTCATATATGATTTTATGGGGTAAATGAATTAAAAAAC +ATTTGTTTTCTTGGCGTCAAACT + diff --git a/src/gffread/test_data/sequence.gff3 b/src/gffread/test_data/sequence.gff3 new file mode 100644 index 00000000..c6a77a7a --- /dev/null +++ b/src/gffread/test_data/sequence.gff3 @@ -0,0 +1,9 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +##sequence-region NM_141699.3 1 933 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7227 +NM_141699.3 RefSeq region 1 933 . + . ID=NM_141699.3:1..933;Dbxref=taxon:7227;Name=3R;chromosome=3R;gbkey=Src;genome=chromosome;genotype=y[1]%3B Gr22b[1] Gr22d[1] cn[1] CG33964[R4.2] bw[1] sp[1]%3B LysC[1] MstProx[1] GstD5[1] Rh6[1];mol_type=mRNA +NM_141699.3 RefSeq gene 1 933 . + . 
ID=gene-Dmel_CG16905;Dbxref=FLYBASE:FBgn0037762,GeneID:41211;Name=eloF;cyt_map=85E10-85E10;description=elongase F;gbkey=Gene;gen_map=3-49 cM;gene=eloF;gene_synonym=CG16905,Dmel\CG16905,EloF;locus_tag=Dmel_CG16905 +NM_141699.3 RefSeq CDS 22 795 . + 0 ID=cds-NP_649956.1;Parent=gene-Dmel_CG16905;Dbxref=FLYBASE:FBpp0081622,GeneID:41211,GenBank:NP_649956.1,FLYBASE:FBgn0037762;Name=NP_649956.1;gbkey=CDS;gene=eloF;locus_tag=Dmel_CG16905;orig_transcript_id=gnl|FlyBase|CG16905-RA;product=elongase F;protein_id=NP_649956.1 +