-
Notifications
You must be signed in to change notification settings - Fork 0
/
miRcounts.py
133 lines (105 loc) · 4.17 KB
/
miRcounts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#! /usr/bin/env python
'''
This will take raw small RNA reads in fastq format and output counts of
the known miRNA.
This script requires Python V2, Biopython, and Blast+.
usage: ./miRcounts.py known_miRNA RAWREADS.fastq output.txt
'''
import sys as sys
import time
import tempfile
from Bio import SeqIO
from StringIO import StringIO
import subprocess as SP
start_time = time.time()
knownMiRNA=open(sys.argv[1],'r')
fileinput=open(sys.argv[2],'r')
fileoutput=open(sys.argv[3],'w')
fastaDB = tempfile.NamedTemporaryFile()
knownMiRNAlen = open("TEMPmiRlenFILE.txt", 'w+')
##############################################################################
# First add the length of each known miRNA sequence to the name because
# blast wont check for exact length match, so later we have to check
# against this 'real' length.
miRNA = []
for line in knownMiRNA:
miRNA.append(line)
names = miRNA[::2]
seqs = miRNA[1::2]
numPrefix = []
for line in seqs:
numPrefix.append(str(len(line)-1) + '~')
justNames = []
for line in names:
justNames.append(line.split('>')[1])
# for this part:
namesNums = [] # The new miRNA names with length prefix attached
namesDB = [] # will need this list of names for the counter in the last part
for x in range(len(names)):
namesDB.append(numPrefix[x] + justNames[x].rstrip())
namesNums.append('>' + numPrefix[x] + justNames[x])
final = []
for x in range(len(names)):
final.append(namesNums[x])
final.append(seqs[x])
# put the miRNA DB with number prefix into a temporary file
for part in range(len(final)):
knownMiRNAlen.write(final[part])
for i in knownMiRNAlen:
print i
# Convert the raw fastq file into a fasta file
print "*** First we convert the fastq file into a fasta file ***"
num_reads = SeqIO.convert(fileinput.name, "fastq", fastaDB.name, "fasta")
##############################################################################
# Run BLAST
# To make the fasta file into a blast database
print "*** Making fasta file into Blast database. ***"
makedb = SP.Popen(['makeblastdb', '-in', fastaDB.name, '-dbtype', 'nucl'])
makedb.wait()
# Running the blast (this is a special blast with parameters optimised
# for short reads):
print "*** blast database is ready, now the blast starts ***"
blastseqs = SP.Popen(['blastn', '-query', knownMiRNAlen.name, '-db', fastaDB.name, '-evalue', '0.01', '-task', 'blastn-short', '-perc_identity', '100', '-strand', 'plus', '-max_target_seqs', '10000000', '-outfmt', "6 qseqid evalue length sseqid nident"], stdout=SP.PIPE)
# Since used PIPE to store the blastn output, retrieve it with .communicate()
blastOutput = blastseqs.communicate()
outlist = blastOutput[0].split('\n')
# because the above separating makes the last item in the list, '',
# we remove it:
outlist.pop()
##############################################################################
# Count up the miRNA found with the blast (only keeping identical lengths):
# make a dictionary to hold the miRNA names and counts:
counts = {}
for name in namesDB:
name = name.rsplit('~')[1]
counts[name] = 0
# Make a dictionary of the maximum match length for each read
# This is so that I only get the longest possible for each similar miRNA
# which are just one shorter dont get counted twice.
outlist2 = {}
for line in outlist:
seq = line.rsplit('\t')
readID = seq[3]
readMatchLength = seq[2]
if readID not in outlist2:
outlist2[readID] = readMatchLength
elif readMatchLength > outlist2[readID]:
outlist2[readID] = readMatchLength
# now go through the blast output and count up the unique hits
for line in outlist:
seq = line.rsplit('\t')
name = seq[0].rsplit('~')[1]
readID = seq[3]
expect_length = int(seq[0].rsplit('~')[0])
if expect_length == int(seq[2]) == int(outlist2[readID]):
counts[name] += 1
# write my count dictionary into the output file
sumCounts = 0
for key, value in counts.items():
fileoutput.write("%s\t%s\n" %(key,value))
sumCounts += value
print("--- RUNTIME: %s minutes ---" % ((time.time() - start_time)/60))
print "*** Counting complete, check out the output file! ***"
print("Total known miRNA found: %s" % sumCounts)
knownMiRNAlen.close()
fastaDB.close()