Skip to content

Commit

Permalink
add gff_intersect.py
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Jun 25, 2015
1 parent f77c795 commit f7cf494
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 19 deletions.
6 changes: 0 additions & 6 deletions .directory

This file was deleted.

6 changes: 3 additions & 3 deletions file_formats/bam2gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
strand, start, end = '-', read2['start'], read1['end']

sys.stdout.write('\t'.join(
[read.query_name, 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.',
read1['ref']]) + "\n")
[read1['ref'], 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.',
read.query_name]) + "\n")

stats['paired'] += 1

Expand All @@ -57,4 +57,4 @@
continue
stats['unpaired'] += 1

sys.stderr.write('summary: {}\n'.format(stats))
sys.stderr.write('{} summary: {}\n'.format(args.bamfile, stats))
24 changes: 14 additions & 10 deletions file_formats/extract_cds_by_gff.py → file_formats/gff2fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
import gzip
from collections import defaultdict
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

parser = argparse.ArgumentParser(description="extract_cds_by_gff")
parser.add_argument('-t', '--type', type=str,
default='CDS', help='gene type [CDS]')
parser.add_argument('-t', '--type', type=str,
default='CDS', help='gene type. "." for any types. [CDS]')
parser.add_argument('-us', '--up-stream', type=int,
default=0, help='up stream length [0]')
parser.add_argument('-ds', '--down-stream', type=int,
default=0, help='down stream length [0]')
default=0, help='up stream length [0]')
parser.add_argument('-ds', '--down-stream', type=int,
default=0, help='down stream length [0]')
parser.add_argument('gff_file', type=str, help='gff file')
parser.add_argument('fasta_file', type=str, help='fasta file')
args = parser.parse_args()
Expand All @@ -32,9 +33,9 @@ def read_gff_file(file):
continue
name = data[0]
gene = dict()
gene['type'], gene['start'], gene['end'], gene['strand']
= data[2], int(data[3]), int(data[4]), data[6]
gene['type'], gene['start'], gene['end'], gene['strand'] = data[2], int(data[3]), int(data[4]), data[6]
genes[name].append(gene)

return genes


Expand All @@ -43,11 +44,12 @@ def read_gff_file(file):
fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r')
for record in SeqIO.parse(fh, 'fasta'):
name, genome = record.id, record.seq
if record.id not in genes:

if name not in genes:
continue

for gene in genes[name]:
if gene['type'].lower() != args.type.lower():
if args.type != '.' and gene['type'].lower() != args.type.lower():
continue
seq = ''
if gene['strand'] == '+':
Expand All @@ -58,5 +60,7 @@ def read_gff_file(file):
s = gene['start'] - args.down_stream - 1
s = 0 if s < 0 else s
seq = genome[s:gene['end'] + args.up_stream].reverse_complement()
print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq))
SeqIO.write(
SeqRecord(seq, id='{}_{}..{}..{}'.format(name, gene['start'], gene['end'], gene['strand']), description=''),
sys.stdout, 'fasta')
fh.close()
93 changes: 93 additions & 0 deletions file_formats/gff_intersect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python
# https://github.com/shenwei356/bio_scripts

from __future__ import print_function, division
import argparse
import sys
from collections import defaultdict, Counter
import operator
from bx.intervals.intersection import Intersecter, Interval

parser = argparse.ArgumentParser(description="gff intersect",
epilog="https://github.com/shenwei356/bio_scripts")

parser.add_argument('query', type=str, help='gff file b (query)')
parser.add_argument('subject', type=str, help='gff file a (subject)')

parser.add_argument("-v", "--verbose", help='verbosely print information',
action="count", default=0)

args = parser.parse_args()

sys.stderr.write('building tree\n')
trees = dict()
with open(args.subject) as fh:
genome = ''
for line in fh:
data = line.rstrip().split('\t')
g, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]
if g != genome:
genome = g
trees[genome] = Intersecter()

if strand == '-': # complement strand
start, end = -end, -start
trees[genome].add_interval(Interval(start, end, value=product))

sys.stderr.write('querying\n')
with open(args.query) as fh:
for line in fh:
data = line.rstrip().split('\t')
genome, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]

if genome not in trees:
continue

overlaps = trees[genome].find(start, end)
if len(overlaps) == 0:
continue

overlap_data, stats = list(), Counter()
for x in overlaps:
s, e = x.start, x.end
if s < 0: # complement strand
s, e = -x.end, -x.start

overlap, t = 0, ''
if s <= start:
if e >= end:
# start ======== end
# s ------------- e
overlap = end - start + 1
t = 'cover'
else:
# start ======== end
# s ------ e
overlap = e - start + 1
t = 'overlap.upstream' if strand == '+' else 'overlap.downstream'
else:
if e >= end:
# start ======== end
# s ------ e
overlap = end - s + 1
t = 'overlap.downstream' if strand == '+' else 'overlap.upstream'
else:
# start ======== end
# s --- e
overlap = e - s + 1
t = 'embed'
if strand == '+':
frame = abs(s - start) % 3
else:
frame = abs(e - end) % 3

stats[t] += 1
overlap_data.append(
[str(i) for i in [x.value, s, e, overlap, round(100 * overlap / (end - start + 1), 1), t, frame]])

sys.stdout.write('>' + line)
sys.stdout.write('summary: {}\n'.format(stats))
sys.stdout.write('\t'.join(['name', 'start', 'end', 'overlap', 'overlap%', 'type', 'frame']) + '\n')
for overlap in sorted(overlap_data, key=lambda o: (o[5], o[6], -float(o[4]))):
sys.stdout.write('\t'.join(overlap) + '\n')
sys.stdout.write('\n')
File renamed without changes.

0 comments on commit f7cf494

Please sign in to comment.