add gff_intersect.py

shenwei356 · Jun 25, 2015 · f7cf494 · f7cf494
1 parent f77c795
commit f7cf494
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 19 deletions.
diff --git a/.directory b/.directory
diff --git a/file_formats/bam2gff.py b/file_formats/bam2gff.py
@@ -43,8 +43,8 @@
                 strand, start, end = '-', read2['start'], read1['end']
 
             sys.stdout.write('\t'.join(
-                [read.query_name, 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.',
-                 read1['ref']]) + "\n")
+                [read1['ref'], 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.',
+                 read.query_name]) + "\n")
 
             stats['paired'] += 1
 
@@ -57,4 +57,4 @@
         continue
     stats['unpaired'] += 1
 
-sys.stderr.write('summary: {}\n'.format(stats))
+sys.stderr.write('{} summary: {}\n'.format(args.bamfile, stats))
diff --git a/file_formats/extract_cds_by_gff.py → file_formats/gff2fa.py b/file_formats/extract_cds_by_gff.py → file_formats/gff2fa.py
@@ -7,14 +7,15 @@
 import gzip
 from collections import defaultdict
 from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
 
 parser = argparse.ArgumentParser(description="extract_cds_by_gff")
-parser.add_argument('-t', '--type', type=str, 
-					default='CDS', help='gene type [CDS]')
+parser.add_argument('-t', '--type', type=str,
+                    default='CDS', help='gene type. "." for any types. [CDS]')
 parser.add_argument('-us', '--up-stream', type=int,
-					default=0, help='up stream length [0]')
-parser.add_argument('-ds', '--down-stream', type=int, 
-					default=0, help='down stream length [0]')
+                    default=0, help='up stream length [0]')
+parser.add_argument('-ds', '--down-stream', type=int,
+                    default=0, help='down stream length [0]')
 parser.add_argument('gff_file', type=str, help='gff file')
 parser.add_argument('fasta_file', type=str, help='fasta file')
 args = parser.parse_args()
@@ -32,9 +33,9 @@ def read_gff_file(file):
                 continue
             name = data[0]
             gene = dict()
-            gene['type'], gene['start'], gene['end'], gene['strand'] 
-				= data[2], int(data[3]), int(data[4]), data[6]
+            gene['type'], gene['start'], gene['end'], gene['strand'] = data[2], int(data[3]), int(data[4]), data[6]
             genes[name].append(gene)
+
     return genes
 
 
@@ -43,11 +44,12 @@ def read_gff_file(file):
 fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r')
 for record in SeqIO.parse(fh, 'fasta'):
     name, genome = record.id, record.seq
-    if record.id not in genes:
+
+    if name not in genes:
         continue
 
     for gene in genes[name]:
-        if gene['type'].lower() != args.type.lower():
+        if args.type != '.' and gene['type'].lower() != args.type.lower():
             continue
         seq = ''
         if gene['strand'] == '+':
@@ -58,5 +60,7 @@ def read_gff_file(file):
             s = gene['start'] - args.down_stream - 1
             s = 0 if s < 0 else s
             seq = genome[s:gene['end'] + args.up_stream].reverse_complement()
-        print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq))
+        SeqIO.write(
+            SeqRecord(seq, id='{}_{}..{}..{}'.format(name, gene['start'], gene['end'], gene['strand']), description=''),
+            sys.stdout, 'fasta')
 fh.close()
diff --git a/file_formats/gff_intersect.py b/file_formats/gff_intersect.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# https://github.com/shenwei356/bio_scripts
+
+from __future__ import print_function, division
+import argparse
+import sys
+from collections import defaultdict, Counter
+import operator
+from bx.intervals.intersection import Intersecter, Interval
+
+parser = argparse.ArgumentParser(description="gff intersect",
+                                 epilog="https://github.com/shenwei356/bio_scripts")
+
+parser.add_argument('query', type=str, help='gff file b (query)')
+parser.add_argument('subject', type=str, help='gff file a (subject)')
+
+parser.add_argument("-v", "--verbose", help='verbosely print information',
+                    action="count", default=0)
+
+args = parser.parse_args()
+
+sys.stderr.write('building tree\n')
+trees = dict()
+with open(args.subject) as fh:
+    genome = ''
+    for line in fh:
+        data = line.rstrip().split('\t')
+        g, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]
+        if g != genome:
+            genome = g
+            trees[genome] = Intersecter()
+
+        if strand == '-':  # complement strand
+            start, end = -end, -start
+        trees[genome].add_interval(Interval(start, end, value=product))
+
+sys.stderr.write('querying\n')
+with open(args.query) as fh:
+    for line in fh:
+        data = line.rstrip().split('\t')
+        genome, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]
+
+        if genome not in trees:
+            continue
+
+        overlaps = trees[genome].find(start, end)
+        if len(overlaps) == 0:
+            continue
+
+        overlap_data, stats = list(), Counter()
+        for x in overlaps:
+            s, e = x.start, x.end
+            if s < 0:  # complement strand
+                s, e = -x.end, -x.start
+
+            overlap, t = 0, ''
+            if s <= start:
+                if e >= end:
+                    #   start ======== end
+                    #     s ------------- e
+                    overlap = end - start + 1
+                    t = 'cover'
+                else:
+                    #  start ======== end
+                    #   s ------ e
+                    overlap = e - start + 1
+                    t = 'overlap.upstream' if strand == '+' else 'overlap.downstream'
+            else:
+                if e >= end:
+                    #   start ======== end
+                    #           s ------ e
+                    overlap = end - s + 1
+                    t = 'overlap.downstream' if strand == '+' else 'overlap.upstream'
+                else:
+                    #   start ======== end
+                    #          s --- e
+                    overlap = e - s + 1
+                    t = 'embed'
+            if strand == '+':
+                frame = abs(s - start) % 3
+            else:
+                frame = abs(e - end) % 3
+
+            stats[t] += 1
+            overlap_data.append(
+                [str(i) for i in [x.value, s, e, overlap, round(100 * overlap / (end - start + 1), 1), t, frame]])
+
+        sys.stdout.write('>' + line)
+        sys.stdout.write('summary: {}\n'.format(stats))
+        sys.stdout.write('\t'.join(['name', 'start', 'end', 'overlap', 'overlap%', 'type', 'frame']) + '\n')
+        for overlap in sorted(overlap_data, key=lambda o: (o[5], o[6], -float(o[4]))):
+            sys.stdout.write('\t'.join(overlap) + '\n')
+        sys.stdout.write('\n')
diff --git a/file_formats/extract_cds_by_gff.pl → for_education/extract_cds_by_gff.pl b/file_formats/extract_cds_by_gff.pl → for_education/extract_cds_by_gff.pl