From 1b313be181e1de53687794573be8fac58a60def5 Mon Sep 17 00:00:00 2001 From: shenwei356 Date: Wed, 24 Jun 2015 22:37:09 +0800 Subject: [PATCH] bam2gff.py --- .../add_annotations_to_myva.pl | 0 file_formats/bam2gff.py | 56 +++++++++++++++++++ .../extract_cds_by_gff.pl | 0 .../extract_cds_by_gff.py | 0 ...extract_cds_from_glimmer_predict_result.pl | 14 +++-- .../extract_features_from_genbank_file.py | 11 ++-- .../extract_sequence_from_genbank_file.pl | 0 7 files changed, 72 insertions(+), 9 deletions(-) rename {seq_format => file_formats}/add_annotations_to_myva.pl (100%) create mode 100755 file_formats/bam2gff.py rename {seq_format => file_formats}/extract_cds_by_gff.pl (100%) rename {seq_format => file_formats}/extract_cds_by_gff.py (100%) rename {seq_format => file_formats}/extract_cds_from_glimmer_predict_result.pl (67%) rename {seq_format => file_formats}/extract_features_from_genbank_file.py (91%) rename {seq_format => file_formats}/extract_sequence_from_genbank_file.pl (100%) diff --git a/seq_format/add_annotations_to_myva.pl b/file_formats/add_annotations_to_myva.pl similarity index 100% rename from seq_format/add_annotations_to_myva.pl rename to file_formats/add_annotations_to_myva.pl diff --git a/file_formats/bam2gff.py b/file_formats/bam2gff.py new file mode 100755 index 0000000..39f5e2d --- /dev/null +++ b/file_formats/bam2gff.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# https://github.com/shenwei356/bio_scripts + +import argparse +import sys +from collections import defaultdict +import pysam + +parser = argparse.ArgumentParser(description="bam2gff") + +parser.add_argument('bamfile', type=str, help='bam file') + +parser.add_argument("-v", "--verbose", help='verbosely print information', + action="count", default=0) + +args = parser.parse_args() + +pairs = defaultdict(dict) + + +def pairs2gff(pairs): + todelete = list() + for query, pair in pairs.items(): + if len(pair.keys()) < 2: + continue + read1, read2 = pair['read1'], pair['read2'] + + if not read1['reverse']: + strand, start, end = '+', read1['start'], read2['end'] + else: + strand, start, end = '-', read2['start'], read1['end'] + + sys.stdout.write('\t'.join( + [query, 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.', + read1['ref']]) + "\n") + todelete.append(query) + for query in todelete: + del pairs[query] + + +samfile = pysam.AlignmentFile(args.bamfile, "rb") +for read in samfile.fetch(): + if read.is_proper_pair and not read.is_secondary: + if read.reference_length < read.query_length * 0.75: # full match + continue + + pairs[read.query_name]['read1' if read.is_read1 else 'read2'] = {'start': read.reference_start, + 'end': read.reference_end, + 'ref': samfile.getrname(read.reference_id), + 'reverse': read.is_reverse} + if len(pairs) > 1000: + pairs2gff(pairs) + +samfile.close() + +pairs2gff(pairs) diff --git a/seq_format/extract_cds_by_gff.pl b/file_formats/extract_cds_by_gff.pl similarity index 100% rename from seq_format/extract_cds_by_gff.pl rename to file_formats/extract_cds_by_gff.pl diff --git a/seq_format/extract_cds_by_gff.py b/file_formats/extract_cds_by_gff.py similarity index 100% rename from seq_format/extract_cds_by_gff.py rename to file_formats/extract_cds_by_gff.py diff --git a/seq_format/extract_cds_from_glimmer_predict_result.pl b/file_formats/extract_cds_from_glimmer_predict_result.pl similarity index 67% rename from seq_format/extract_cds_from_glimmer_predict_result.pl rename to file_formats/extract_cds_from_glimmer_predict_result.pl index 089818c..d2194f1 100755 --- a/seq_format/extract_cds_from_glimmer_predict_result.pl +++ b/file_formats/extract_cds_from_glimmer_predict_result.pl @@ -8,7 +8,7 @@ $0 = basename($0); my $usage = qq( - usage: $0 [gtf] + usage: $0 [gff] ); die $usage unless @ARGV == 2 or @ARGV == 3; @@ -19,15 +19,19 @@ my $genome = ( values %{ read_sequence_from_fasta_file($seqfile) } )[0]; my @data = (); -my ( $name, $a, $b, $frame, $seq ); +my ( $genome, $name, $a, $b, $frame, $seq ); open my $fh, '<', $prfile or die "fail to open file: $prfile\n"; while (<$fh>) { + $genome = $1 if /^>(.+)/; @data = split /\s+/, $_; - next unless scalar(@data) >= 9; + next unless scalar(@data) == 5; ( $name, $a, $b, $frame ) = @data; + next unless $a =~ /^\d+$/; - if ( defined $gtf ) { - printf "%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s\n", + if ( $gtf eq 'gff' ) { + my $strand = $frame > 0 ? '+' : '-'; + printf "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", + $name, 'glimmer', 'CDS', $a, $b, '.', $strand, '.', $genome; } else { if ( $frame > 0 ) { diff --git a/seq_format/extract_features_from_genbank_file.py b/file_formats/extract_features_from_genbank_file.py similarity index 91% rename from seq_format/extract_features_from_genbank_file.py rename to file_formats/extract_features_from_genbank_file.py index 451d5af..4ceb6d5 100755 --- a/seq_format/extract_features_from_genbank_file.py +++ b/file_formats/extract_features_from_genbank_file.py @@ -1,11 +1,8 @@ #!/usr/bin/env python # https://github.com/shenwei356/bio_scripts -from __future__ import print_function import sys -import os import argparse -import logging from Bio import SeqIO from Bio.Seq import Seq @@ -19,7 +16,7 @@ def parse_args(): parser.add_argument('gbkfile', type=str, help='Genbank file') parser.add_argument('-t', '--type', type=str, default='CDS', help='Feature type (CDS tRNA). Multiple values should be separated by comma.') - outfmt_choices = ['fasta', 'gtf'] + outfmt_choices = ['fasta', 'gtf', 'gff'] parser.add_argument('-f', '--outfmt', type=str, default='fasta', help='Out format, fasta or gtf') @@ -103,3 +100,9 @@ def parse_args(): sys.stdout.write('\t'.join( [record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame), attribute]) + "\n") + + elif args.outfmt == 'gff': + frame = int(qualifiers['codon_start'][0]) - 1 if 'codon_start' in qualifiers else 0 + sys.stdout.write('\t'.join( + [record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame), + product]) + "\n") diff --git a/seq_format/extract_sequence_from_genbank_file.pl b/file_formats/extract_sequence_from_genbank_file.pl similarity index 100% rename from seq_format/extract_sequence_from_genbank_file.pl rename to file_formats/extract_sequence_from_genbank_file.pl