From f7cf4946dfa976a33f5a3e718dad2ae6b1e4e70d Mon Sep 17 00:00:00 2001 From: shenwei356 Date: Thu, 25 Jun 2015 16:25:06 +0800 Subject: [PATCH] add gff_intersect.py --- .directory | 6 -- file_formats/bam2gff.py | 6 +- .../{extract_cds_by_gff.py => gff2fa.py} | 24 +++-- file_formats/gff_intersect.py | 93 +++++++++++++++++++ .../extract_cds_by_gff.pl | 0 5 files changed, 110 insertions(+), 19 deletions(-) delete mode 100644 .directory rename file_formats/{extract_cds_by_gff.py => gff2fa.py} (71%) create mode 100755 file_formats/gff_intersect.py rename {file_formats => for_education}/extract_cds_by_gff.pl (100%) diff --git a/.directory b/.directory deleted file mode 100644 index 109944c..0000000 --- a/.directory +++ /dev/null @@ -1,6 +0,0 @@ -[Dolphin] -SortOrder=1 -SortRole=date -Timestamp=2015,6,18,21,52,44 -Version=3 -ViewMode=1 diff --git a/file_formats/bam2gff.py b/file_formats/bam2gff.py index c3eaac8..52aefa5 100755 --- a/file_formats/bam2gff.py +++ b/file_formats/bam2gff.py @@ -43,8 +43,8 @@ strand, start, end = '-', read2['start'], read1['end'] sys.stdout.write('\t'.join( - [read.query_name, 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.', - read1['ref']]) + "\n") + [read1['ref'], 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.', + read.query_name]) + "\n") stats['paired'] += 1 @@ -57,4 +57,4 @@ continue stats['unpaired'] += 1 -sys.stderr.write('summary: {}\n'.format(stats)) +sys.stderr.write('{} summary: {}\n'.format(args.bamfile, stats)) diff --git a/file_formats/extract_cds_by_gff.py b/file_formats/gff2fa.py similarity index 71% rename from file_formats/extract_cds_by_gff.py rename to file_formats/gff2fa.py index 6715ad4..d88d37e 100755 --- a/file_formats/extract_cds_by_gff.py +++ b/file_formats/gff2fa.py @@ -7,14 +7,15 @@ import gzip from collections import defaultdict from Bio import SeqIO +from Bio.SeqRecord import SeqRecord parser = argparse.ArgumentParser(description="extract_cds_by_gff") -parser.add_argument('-t', '--type', type=str, - default='CDS', help='gene type [CDS]') +parser.add_argument('-t', '--type', type=str, + default='CDS', help='gene type. "." for any types. [CDS]') parser.add_argument('-us', '--up-stream', type=int, - default=0, help='up stream length [0]') -parser.add_argument('-ds', '--down-stream', type=int, - default=0, help='down stream length [0]') + default=0, help='up stream length [0]') +parser.add_argument('-ds', '--down-stream', type=int, + default=0, help='down stream length [0]') parser.add_argument('gff_file', type=str, help='gff file') parser.add_argument('fasta_file', type=str, help='fasta file') args = parser.parse_args() @@ -32,9 +33,9 @@ def read_gff_file(file): continue name = data[0] gene = dict() - gene['type'], gene['start'], gene['end'], gene['strand'] - = data[2], int(data[3]), int(data[4]), data[6] + gene['type'], gene['start'], gene['end'], gene['strand'] = data[2], int(data[3]), int(data[4]), data[6] genes[name].append(gene) + return genes @@ -43,11 +44,12 @@ def read_gff_file(file): fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r') for record in SeqIO.parse(fh, 'fasta'): name, genome = record.id, record.seq - if record.id not in genes: + + if name not in genes: continue for gene in genes[name]: - if gene['type'].lower() != args.type.lower(): + if args.type != '.' and gene['type'].lower() != args.type.lower(): continue seq = '' if gene['strand'] == '+': @@ -58,5 +60,7 @@ def read_gff_file(file): s = gene['start'] - args.down_stream - 1 s = 0 if s < 0 else s seq = genome[s:gene['end'] + args.up_stream].reverse_complement() - print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq)) + SeqIO.write( + SeqRecord(seq, id='{}_{}..{}..{}'.format(name, gene['start'], gene['end'], gene['strand']), description=''), + sys.stdout, 'fasta') fh.close() diff --git a/file_formats/gff_intersect.py b/file_formats/gff_intersect.py new file mode 100755 index 0000000..e111fd2 --- /dev/null +++ b/file_formats/gff_intersect.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# https://github.com/shenwei356/bio_scripts + +from __future__ import print_function, division +import argparse +import sys +from collections import defaultdict, Counter +import operator +from bx.intervals.intersection import Intersecter, Interval + +parser = argparse.ArgumentParser(description="gff intersect", + epilog="https://github.com/shenwei356/bio_scripts") + +parser.add_argument('query', type=str, help='gff file b (query)') +parser.add_argument('subject', type=str, help='gff file a (subject)') + +parser.add_argument("-v", "--verbose", help='verbosely print information', + action="count", default=0) + +args = parser.parse_args() + +sys.stderr.write('building tree\n') +trees = dict() +with open(args.subject) as fh: + genome = '' + for line in fh: + data = line.rstrip().split('\t') + g, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8] + if g != genome: + genome = g + trees[genome] = Intersecter() + + if strand == '-': # complement strand + start, end = -end, -start + trees[genome].add_interval(Interval(start, end, value=product)) + +sys.stderr.write('querying\n') +with open(args.query) as fh: + for line in fh: + data = line.rstrip().split('\t') + genome, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8] + + if genome not in trees: + continue + + overlaps = trees[genome].find(start, end) + if len(overlaps) == 0: + continue + + overlap_data, stats = list(), Counter() + for x in overlaps: + s, e = x.start, x.end + if s < 0: # complement strand + s, e = -x.end, -x.start + + overlap, t = 0, '' + if s <= start: + if e >= end: + # start ======== end + # s ------------- e + overlap = end - start + 1 + t = 'cover' + else: + # start ======== end + # s ------ e + overlap = e - start + 1 + t = 'overlap.upstream' if strand == '+' else 'overlap.downstream' + else: + if e >= end: + # start ======== end + # s ------ e + overlap = end - s + 1 + t = 'overlap.downstream' if strand == '+' else 'overlap.upstream' + else: + # start ======== end + # s --- e + overlap = e - s + 1 + t = 'embed' + if strand == '+': + frame = abs(s - start) % 3 + else: + frame = abs(e - end) % 3 + + stats[t] += 1 + overlap_data.append( + [str(i) for i in [x.value, s, e, overlap, round(100 * overlap / (end - start + 1), 1), t, frame]]) + + sys.stdout.write('>' + line) + sys.stdout.write('summary: {}\n'.format(stats)) + sys.stdout.write('\t'.join(['name', 'start', 'end', 'overlap', 'overlap%', 'type', 'frame']) + '\n') + for overlap in sorted(overlap_data, key=lambda o: (o[5], o[6], -float(o[4]))): + sys.stdout.write('\t'.join(overlap) + '\n') + sys.stdout.write('\n') \ No newline at end of file diff --git a/file_formats/extract_cds_by_gff.pl b/for_education/extract_cds_by_gff.pl similarity index 100% rename from file_formats/extract_cds_by_gff.pl rename to for_education/extract_cds_by_gff.pl