Skip to content

Commit

Permalink
bam2gff.py
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Jun 24, 2015
1 parent da2636f commit 1b313be
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 9 deletions.
File renamed without changes.
56 changes: 56 additions & 0 deletions file_formats/bam2gff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# https://github.com/shenwei356/bio_scripts

import argparse
import sys
from collections import defaultdict
import pysam

# Command-line interface: one positional BAM path plus a repeatable -v flag.
parser = argparse.ArgumentParser(description="bam2gff")

parser.add_argument('bamfile', type=str, help='bam file')

# action="count" makes args.verbose the number of -v occurrences
# (0 when absent); it is not consulted anywhere else in this script.
parser.add_argument("-v", "--verbose", help='verbosely print information',
                    action="count", default=0)

args = parser.parse_args()

# Buffer of partially-seen read pairs:
#   query_name -> {'read1'/'read2': {'start', 'end', 'ref', 'reverse'}}
# Entries are emitted to stdout and removed by pairs2gff() once both
# mates have been recorded.
pairs = defaultdict(dict)


def pairs2gff(pairs):
    """Emit one GFF-style line per completed read pair and prune the buffer.

    A pair is complete once both 'read1' and 'read2' entries exist for a
    query name.  The fragment start is converted from BAM's 0-based
    coordinate to GFF's 1-based coordinate by adding 1; the fragment spans
    from the forward mate's start to the reverse mate's end.  Completed
    entries are deleted from *pairs* in place; incomplete ones stay buffered.

    NOTE(review): the query name is written in column 1 and the reference
    name in column 9, the reverse of the conventional GFF layout — confirm
    downstream consumers expect this ordering before changing it.
    """
    done = [name for name, mates in pairs.items() if len(mates) >= 2]
    for name in done:
        first, second = pairs[name]['read1'], pairs[name]['read2']
        if first['reverse']:
            strand, start, end = '-', second['start'], first['end']
        else:
            strand, start, end = '+', first['start'], second['end']
        fields = (name, 'bam2gff.py', 'paired_ends', str(start + 1),
                  str(end), '.', strand, '.', first['ref'])
        sys.stdout.write('\t'.join(fields) + "\n")
        del pairs[name]


# Stream the BAM file, buffering per-mate coordinates until both reads of a
# proper pair have been seen, then emit one line per pair via pairs2gff().
samfile = pysam.AlignmentFile(args.bamfile, "rb")
for read in samfile.fetch():
    if read.is_proper_pair and not read.is_secondary:
        # Require a near-full-length alignment: skip reads whose aligned
        # reference span is under 75% of the read length.
        if read.reference_length < read.query_length * 0.75:  # near-full match filter
            continue

        # Record this mate's 0-based half-open coordinates, reference name,
        # and orientation under its query name.
        pairs[read.query_name]['read1' if read.is_read1 else 'read2'] = {'start': read.reference_start,
                                                                        'end': read.reference_end,
                                                                        'ref': samfile.getrname(read.reference_id),
                                                                        'reverse': read.is_reverse}
        # Periodic flush keeps the buffer bounded.  NOTE(review): if more than
        # 1000 entries are unpaired singletons, pairs2gff() rescans the whole
        # buffer on every subsequent read — potentially quadratic on inputs
        # with many orphaned mates; confirm this is acceptable for your data.
        if len(pairs) > 1000:
            pairs2gff(pairs)

samfile.close()

# Final flush: emit any pairs completed since the last periodic flush.
# Unpaired leftovers are silently discarded.
pairs2gff(pairs)
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

$0 = basename($0);
my $usage = qq(
usage: $0 <glimmer .predict file> <genome file> [gtf]
usage: $0 <glimmer .predict file> <genome file> [gff]
);
die $usage unless @ARGV == 2 or @ARGV == 3;
Expand All @@ -19,15 +19,19 @@
my $genome = ( values %{ read_sequence_from_fasta_file($seqfile) } )[0];

my @data = ();
my ( $name, $a, $b, $frame, $seq );
my ( $genome, $name, $a, $b, $frame, $seq );
open my $fh, '<', $prfile or die "fail to open file: $prfile\n";
while (<$fh>) {
$genome = $1 if /^>(.+)/;
@data = split /\s+/, $_;
next unless scalar(@data) >= 9;
next unless scalar(@data) == 5;
( $name, $a, $b, $frame ) = @data;
next unless $a =~ /^\d+$/;

if ( defined $gtf ) {
printf "%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s\n",
if ( $gtf eq 'gff' ) {
my $strand = $frame > 0 ? '+' : '-';
printf "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
$name, 'glimmer', 'CDS', $a, $b, '.', $strand, '.', $genome;
}
else {
if ( $frame > 0 ) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#!/usr/bin/env python
# https://github.com/shenwei356/bio_scripts

from __future__ import print_function
import sys
import os
import argparse
import logging

from Bio import SeqIO
from Bio.Seq import Seq
Expand All @@ -19,7 +16,7 @@ def parse_args():
parser.add_argument('gbkfile', type=str, help='Genbank file')
parser.add_argument('-t', '--type', type=str, default='CDS',
help='Feature type (CDS tRNA). Multiple values should be separated by comma.')
outfmt_choices = ['fasta', 'gtf']
outfmt_choices = ['fasta', 'gtf', 'gff']
parser.add_argument('-f', '--outfmt', type=str, default='fasta',
help='Out format, fasta or gtf')

Expand Down Expand Up @@ -103,3 +100,9 @@ def parse_args():
sys.stdout.write('\t'.join(
[record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame),
attribute]) + "\n")

elif args.outfmt == 'gff':
frame = int(qualifiers['codon_start'][0]) - 1 if 'codon_start' in qualifiers else 0
sys.stdout.write('\t'.join(
[record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame),
product]) + "\n")

0 comments on commit 1b313be

Please sign in to comment.