-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNovelVirusSeq.py
223 lines (199 loc) · 9.54 KB
/
NovelVirusSeq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import subprocess
import argparse
import glob
import timeit
import sys
from tools import annotate_assembled
start = timeit.default_timer()
# IMPORTANT NOTE- THIS NEEDS TO BE RUN WITH PROKKA IN -c MODE
# NOTE- You also need to have a copy of tbl2asn in your home directory
# NOTE- Needs info.csv, template.sbt, annotation.cmt
# python virus_annotate filename basecutoff runname
# TODO: Instead of pulling from the info.csv file we should pull directly from the master spreadsheet
parser = argparse.ArgumentParser(description='Annotate a set of assemblies with prokka, HHpred, and blast')
parser.add_argument('filename', help='The input file, must be a .fasta can contain as many contigs as you please')
parser.add_argument('basecutoff', type=int, help='Minimum length of contig, all assemblies of length less than this '
'will not be processed Passing a zero at this argument signfies that '
'you would like to run this on a segmented genome that you have '
'assembled into one file')
parser.add_argument('runname', help='Give your run a unique name! Useful to track the results later or if you want to '
'do more than one run back to back')
args = parser.parse_args()
min_base_cutoff = args.basecutoff
filename = args.filename
run_name = args.runname
fasta_file = open(filename)
info_list = []
dna_list = []
dna_string = ''
WORKING_DIRECTORY = '/home/ryan/Virus-Annotate/'
HHBLIT_DB = '/home/ryan/uniprot20_2015_06/uniprot20_2015_06'
BLAST_DB = '/home/ryan/db/nr'
# This hacks the preassembled fasta file
if min_base_cutoff == 0:
annotate_assembled(filename, run_name)
sys.exit()
# This reads the assembly file and saves each contig greater than min base cuttoff
for line in fasta_file:
if line[0] == '>':
info_list.append(line[:-1])
if dna_string != '':
dna_list.append(dna_string)
dna_string = ''
else:
dna_string += line[:-1]
dna_list.append(dna_string)
pruned_info_list = []
pruned_dna_list = []
count = 0
for entry in info_list:
if int(entry.split('_')[3]) >= min_base_cutoff:
pruned_info_list.append(entry)
pruned_dna_list.append(dna_list[count])
count += 1
node_list = []
subprocess.call('mkdir ' + WORKING_DIRECTORY + run_name, shell=True)
run_name = run_name + '/'
# Create a new folder for each contig saved in the last step, and save the contig in it as a .fsa with
# Info taken from info.csv (a tab seperated list of information for genbank)
for x in range(0, len(pruned_dna_list)):
# Write the file that prokka will use
one = open(WORKING_DIRECTORY + 'temp_contig_dir/contig_' + str(x).zfill(2) + '.fasta', 'w')
one.write(pruned_info_list[x] + '\n')
one.write(pruned_dna_list[x])
one.close()
# Write the .fsa file for tbl2asn to use, complete with the comment information
subprocess.call('mkdir ' + WORKING_DIRECTORY + run_name + 'contig' + str(x).zfill(2), shell=True)
two = open(WORKING_DIRECTORY + run_name + 'contig' + str(x).zfill(2) + '/contig' + str(x).zfill(2) + '.fsa', 'w')
data = pruned_info_list[x].split('_')
newline = data[0] + '_' + data[1] + '_' + data[2] + '_' + data[3]
node_list.append(newline)
# Read all the comment information in and then create new header line.
info = open(WORKING_DIRECTORY + 'info.csv')
count = 1
for line in info:
if count == 1:
title_list = line.split(',')
if count == 2:
data_list = line.split(',')
count += 1
for index in range(0, len(title_list)):
newline = newline + ' [' + title_list[index].rstrip('\n') + '=' + data_list[index].rstrip('\n') + ']'
newline = newline + ' [strain=' + run_name + str(x) + ']'
two.write(newline + '\n')
two.write(pruned_dna_list[x])
two.close()
info.close()
inc = 0
# Run Prokka in viral mode on each of the putative genomes and save em' all into the prokka_output folder
for contig in sorted(glob.glob(WORKING_DIRECTORY + 'temp_contig_dir/*.fasta')):
subprocess.call('prokka ' + contig + ' --outdir ' + WORKING_DIRECTORY + 'prokka_output/ --prefix contig' +
str(inc).zfill(2) + ' --kingdom Virus --force --compliant', shell=True)
inc += 1
# move and modify the .tbl files from prokka into our final directory
count = 0
for tbl in sorted(glob.glob(WORKING_DIRECTORY + 'prokka_output/*.tbl')):
readnext=True
original = open(tbl)
newfile = open(WORKING_DIRECTORY + run_name + 'contig' + str(count).zfill(2) + '/contig' + str(count).zfill(2) + '.tbl', 'w')
for line in original:
if line[0] == '>' or line[0] == '<':
newfile.write('>Feature ' + node_list[count][1:] + '\n')
elif line[0] != '\t':
if abs(int(line.split()[1]) - int(line.split()[0])) >= 240 and line.split()[2] != 'gene':
newfile.write(line)
readnext = True
else:
readnext = False
elif line.split()[0] == 'product' and readnext:
newfile.write(line)
readnext = True
newfile.close()
original.close()
count += 1
# Go through all the prokka outputs line by line and push the 'genes' that have 'hypothetical protein' through hhblits
# And blastp NOTE- Blastp is running in -remote mode so this needs to get changed during final deployment
count = 0
for prokka_output in sorted(glob.glob(WORKING_DIRECTORY + 'prokka_output/*.faa')):
dna_string = ''
header_list = []
gene_list = []
for line in open(prokka_output, 'r'):
if line[0] == '>':
header_list.append(line[:-1])
if dna_string != '':
gene_list.append(dna_string)
dna_string = ''
else:
dna_string += line[:-1]
gene_list.append(dna_string)
gene_num = 0
for x in range(0, len(header_list)):
# Creates a file the contig folder named gene_num.hhr only hypothetical proteins are displayed
if len(gene_list[x]) >= 80:
gene_num += 1
print(header_list[x])
print(gene_list[x])
d = open('/home/ryan/temp_gene.part', 'w')
d.write(header_list[x] + '\n')
d.write(gene_list[x])
d.close()
# Actually call hhblits and blastp, I'm not even sure if blastp is correct here, and it is remote, but the
# heavy lifting for this step is done already
subprocess.call('hhblits -i /home/ryan/temp_gene.part -o ' + WORKING_DIRECTORY + run_name + 'contig' +
str(count).zfill(2) + '/hh_result' + str(gene_num).zfill(2) +
'.hhr -d ' + HHBLIT_DB + ' -Z 10 -B 10', shell=True)
#subprocess.call('blastp -query /home/ryan/temp_gene.dna -db nr -remote | tee ' + WORKING_DIRECTORY +
#run_name + 'contig' + str(count).zfill(2) + '/blast_result' + str(gene_num).zfill(2) + '.txt', shell=True)
# TODO: make this not remote whenever possible
# TODO: also doen't like proteins not starting with MET
# TODO: comment this out for more accurate timing results
count += 1
count = 0
for prokka_output in sorted(glob.glob(WORKING_DIRECTORY + 'prokka_output/*.ffn')):
dna_string = ''
header_list = []
gene_list = []
for line in open(prokka_output, 'r'):
if line[0] == '>' or line[0] == '<':
header_list.append(line[:-1])
if dna_string != '':
gene_list.append(dna_string)
dna_string = ''
else:
dna_string += line[:-1]
gene_list.append(dna_string)
gene_num = 0
print()
for x in range(0, len(header_list)):
if len(gene_list[x]) >= 240:
gene_num += 1
print(header_list[x])
print(gene_list[x])
d = open('/home/ryan/temp_gene.dna', 'w')
d.write(header_list[x] + '\n')
d.write(gene_list[x])
name = 'hypothetical protein'
d.close()
subprocess.call('blastn -query /home/ryan/temp_gene.dna -db nt -remote | tee ' + WORKING_DIRECTORY +
run_name + 'contig' + str(count).zfill(2) + '/blast_result' + str(gene_num).zfill(2) + '.txt', shell=True)
count += 1
# TODO: Pause here and allow manual annotation? or at least provide a function to go through and re-tbl2asn everything
# Cleanup step
for x in range(0, count):
subprocess.call('cp ' + WORKING_DIRECTORY + 'template.sbt ' + WORKING_DIRECTORY + run_name + 'contig' +
str(x).zfill(2), shell=True)
subprocess.call('cp ' + WORKING_DIRECTORY + 'assembly.cmt ' + WORKING_DIRECTORY + run_name + 'contig' +
str(x).zfill(2), shell=True)
# This means you need to have a copy of tbl2asn in your home directory
subprocess.call('./tbl2asn -p ' + WORKING_DIRECTORY + run_name + 'contig' + str(x).zfill(2) +
' -t ' + WORKING_DIRECTORY + run_name + 'contig' + str(x).zfill(2) +
'/template.sbt -Y ' + WORKING_DIRECTORY + run_name + 'contig' + str(x).zfill(2) + '/assembly.cmt -V vb',
shell=True)
subprocess.call('rm -rf ' + WORKING_DIRECTORY + 'prokka_output/* ' + WORKING_DIRECTORY + 'temp_contig_dir/*',
shell=True)
stop = timeit.default_timer()
print('Annotations should be in folders named contig# all unknown genes will have blastp and hhpred outputs with lower '
'numbers corresponding to earlier unknown genes in the prokka output')
print('total run time')
print(stop - start)