-
Notifications
You must be signed in to change notification settings - Fork 6
/
make_operon_query.py.bak
executable file
·666 lines (537 loc) · 32.9 KB
/
make_operon_query.py.bak
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
#!/usr/bin/env python
# TODO: remove the last of the operon references. this has not been done yet due to a piece of untested code.
# until i get it teched out, i will not mess with teh variable names.
# This code needs to be cleaned up a lot... there is a lot of testing garbage that is no longer needed!
# It however, does work for the purposes of the paper submission.
from multiprocessing import Pool
import time
import os
import sys
#import simplejson as json
import json
import argparse
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
# Copyright(C) 2015 David Ream
# Released under GPL version 3 licence. http://www.gnu.org/licenses/lgpl.html
# Do not remove this comment
# This exists to make the main function easier to read. It contains code to run the argument parser, and does nothing else.
def parser_code():
parser = argparse.ArgumentParser(description='Convert a gene block file into a BLAST query using one or more reference organisms.')
parser.add_argument("-i", "--infolder", dest="infolder", metavar="DIRECTORY", default='./genomes/',
help="Folder containing all genbank files for use by the program. The refrence genbank file must reside here.")
#parser.add_argument("-G", "--genbank_directory", dest="genbank_directory", metavar="DIRECTORY", default='./genomes/',
# help="Folder containing all genbank files for use by the program.")
parser.add_argument("-o", "--outfile", dest="outfile", metavar="FILE", default='./gene_block_query.fa',
help="Resulting BLAST query file the is derived from a gene block file and refrence organism(s).")
parser.add_argument("-b", "--gene_block_file", dest="gene_block_file", metavar="FILE", default='gene_block_names_and_genes.txt',
help="File which contains gene block information for use in gene block queries. The file format is gene_block_name followed by the constituent gene names, tab delineated.")
parser.add_argument("-n", "--num_proc", dest="num_proc", metavar="INT", default = os.sysconf("SC_NPROCESSORS_CONF"), type=int,
help="Number of processors that you want this script to run on. The default is every CPU that the system has.")
parser.add_argument("-r", "--refrence", dest="refrence", metavar="STRING", default = 'NC_000913',
help="Accession number of the refrence organism. This information is used to determine the product type of each gene (RNA/Protein), a necessary piece of information to classify the gene blocks that are under investigation.")
parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
help="Suppresses most program text outputs.")
return parser.parse_args()
def check_options(parsed_args):
# I'm not issuing a message that hey, this file is there and will be overwritten, the program will just overwrite.
# TODO: check that the folder that this is saved in exists
if os.path.isdir(parsed_args.infolder):
infolder = parsed_args.infolder
else:
print(("The infolder %s does not exist.") % parsed_args.infolder)
sys.exit()
outfile = parsed_args.outfile
# Check the gene block file
if os.path.exists(parsed_args.gene_block_file):
gene_block_file = parsed_args.gene_block_file
else:
print(("The gene_block file %s does not exist.") % parsed_args.gene_block_file)
sys.exit()
# section of code that deals determining the number of CPU cores that will be used by the program
if parsed_args.num_proc > os.sysconf("SC_NPROCESSORS_CONF"):
num_proc = os.sysconf("SC_NPROCESSORS_CONF")
elif parsed_args.num_proc < 1:
num_proc = 1
else:
num_proc = int(parsed_args.num_proc)
# I am not checking this option here, just leave it as default for the time being.
refrence_list = [parsed_args.refrence]
quiet = parsed_args.quiet
return infolder, outfile, gene_block_file, num_proc, refrence_list, quiet
#this function will return all of the files that are in a directory. os.walk is recursive traversal.
def returnRecursiveDirFiles(root_dir):
result = []
for path, dir_name, flist in os.walk(root_dir):
for f in flist:
fname = os.path.join(path, f)
if os.path.isfile(fname):
result.append(fname)
return result
# convert a file of format : gene_block name then a list of the full names of the genes within that gene block
# into a dictionary that can be easily accessed or filtered later.
def parse_gene_block_file(fname):
result = {}
for line in [i.strip().split('\t') for i in open(fname).readlines()]:
result.update({line[0]: line[1:]})
return result
# This function creates a dictionary indexed by locus from the input genbank file
# and for my purposes now, it will index genes based on their annotation in genbank
# seq_type will allow us to determine the type of sequence returned in for the dict. default
# will be amino acid because this is a lot less noisy.
def return_genbank_dict(gb_file, key = 'annotation', seq_type = 'amino_acid'):
"""Overview: This function will return a dictionary generated from a genbank file with key value supplied by caller.
Returns: A dictionary created by the supplied genbank file (gb_file) indexed off the key value supplied.
Default: The deafult key is locus, and this is generally the most useful key type since it is garanteed to be
unique within the genbank file. This condition is not necessarily true for any other attribute.
"""
result = {}
seq_record = next(SeqIO.parse(open(gb_file), "genbank"))
accession = seq_record.annotations['accessions'][0].split('.')[0]
common_name = seq_record.annotations['organism'].replace(' ', '_')
result.update({'accession': accession})
result.update({'common_name': common_name})
cnt = 0
# loop over the genbank file
unk_cnt = 1
for fnum, feature in enumerate(seq_record.features):
# here i simply check the gene coding type, and identify them in a way that can be used later.
if feature.type == 'CDS' or feature.type == 'ncRNA' or feature.type == 'tRNA' or feature.type == 'mRNA' or feature.type == 'rRNA':
start = feature.location.start
stop = feature.location.end
#print start, stop
strand = feature.strand
synonyms = 'NONE'
'''
try:
gene = feature.qualifiers['gene'][0]
except:
gene = 'unknown'
'''
# this line might be wrong, just trying to get rid of an unneccessary try/except clause
if 'gene' in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
else:
gene = 'unknown'
if 'gene_synonym' in feature.qualifiers:
synonym_list = feature.qualifiers['gene_synonym'][0].replace(' ', '').split(';')
synonyms = ':'.join(synonym_list)
try:
locus = feature.qualifiers['locus_tag'][0]
except:
try:
locus = feature.qualifiers['gene'][0]
except:
locus = ''
print ('No locus associated. This should never be invoked meaning you are proper fracked. (The gbk file has an error).')
try:
seq = feature.qualifiers['translation']
seq_type = 'Protein'
except:
cnt = cnt + 1
seq = seq_record.seq[start:stop]
seq_type = feature.type
if feature.type == 'CDS':
seq_type = 'Pseudo_Gene'
# attempt to fix an error
if strand == 1:
seq = seq.translate()
else:
seq = seq.reverse_complement().translate()
gc = "%2.1f" % GC(seq_record.seq[start:stop])
# Debugging something odd
#print feature.qualifiers['gene_synonym']
#method = "exact"
if key == 'locus':
result.update({locus: (locus, gene, seq, seq_type, synonyms)})
elif key == 'annotation':
if gene == 'unknown':
new_gene = 'unknown_' + str(unk_cnt)
header = '|'.join([accession, common_name, locus, gene, str(start), str(stop), str(strand), seq_type, synonyms, gc])
result.update({new_gene: [header, ''.join(seq)]})
unk_cnt +=1
else:
header = '|'.join([accession, common_name, locus, gene, str(start), str(stop), str(strand), seq_type, synonyms, gc])
result.update({gene: [header, ''.join(seq)]})
try:
for syn in synonym_list:
result.update({syn: [header, ''.join(seq)]})
except:
pass
#print 'The number of non-protein regions in %s is: %i.' % (common_name, cnt)
return result
# This function will allow me to do the main work of make_gene_block_fasta, but allow parallel
# processing. I will have to make a parallel array, which will take some time to learn. i
# will keep this stub for later to implement.
def parallel_gene_block_fasta(genome):
organism = genome.split('/')[-1].split('.')[0]
organism_dict_for_recovery = {}
org_dict = return_genbank_dict(genome)
organism_dict_for_recovery.update({organism: org_dict})
return (organism, org_dict)
# This function will make a BLAST query from the parsed gene block file. The default behavior of the function is
# to make a query file from all genes in all organisms that are annotated. Later the results will be sorted based
# the needs of the programmer. The defaulted variables allow a single gene block to be chosen individually. The program
# will also store the results of this function in a folder titled blast_query_files, in the recovery folder.
def make_gene_block_fasta2(gene_list, genbank_list, num_processors, folder, ref_list):
pool = Pool(processes = num_processors)
organism_dict_for_recovery = dict(pool.map(parallel_gene_block_fasta, genbank_list))
protein_match = []
rna_match = []
pseudogene_match = []
missing_list = []
refrence_prot = []
refrence_rna = []
# This list should be updated should other types of RNAs be annotated later as parts of (siRNA comes to mind).
RNA_codes = ['rRNA', 'tRNA', 'ncRNA']
for org in list(organism_dict_for_recovery.keys()):
for gene in gene_list:
if gene in list(organism_dict_for_recovery[org].keys()):
if organism_dict_for_recovery[org][gene][0].split('|')[7] == 'Protein':
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
protein_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_prot.append(outseq)
elif organism_dict_for_recovery[org][gene][0].split('|')[7] in RNA_codes:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
rna_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_rna.append(outseq)
elif organism_dict_for_recovery[org][gene][0].split('|')[7] == 'Pseudo_Gene':
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
pseudogene_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_prot.append(outseq)
else:
print(("There was an error in function make_gene_block_fasta2, from script make_operon_query.py for", organism_dict_for_recovery[org][gene][0]))
else: # The gene is missing, we will look for it at a later stage in the program
item = '\t'.join([org, gene])
missing_list.append(item)
handle = open(folder + 'protein_matches.fa', 'w')
SeqIO.write(protein_match, handle,"fasta")
handle.close()
handle = open(folder + 'rna_matches.fa', 'w')
SeqIO.write(rna_match, handle,"fasta")
handle.close()
handle = open(folder + 'pseudogene_matches.fa', 'w')
SeqIO.write(pseudogene_match, handle,"fasta")
handle.close()
handle = open(folder + 'missing_gene_block_genes.txt', 'w')
handle.write('\n'.join(missing_list))
handle.close()
# The goal here is to return the refrence fasta files, so that we can do self homolog dict, which is needed to detect
# fusion proteins later.
ref_prot_outfile = folder + 'refrence_prot.fa'
ref_rna_outfile = folder + 'refrence_rna.fa'
#print "len(gene_list)", len(gene_list)
handle = open(ref_prot_outfile, 'w')
SeqIO.write(refrence_prot, handle, "fasta")
#print "len(refrence_prot)", len(refrence_prot)
handle.close()
handle = open(folder + 'refrence_rna.fa', 'w')
SeqIO.write(refrence_rna, ref_rna_outfile, "fasta")
#print "len(refrence_rna)", len(refrence_rna)
handle.close()
return ref_prot_outfile, ref_rna_outfile, organism_dict_for_recovery
# This function will make a BLAST query from the parsed gene_block file. The default behavior of the function is
# to make a query file from all genes in all organisms that are annotated. Later the results will be sorted based
# the needs of the programmer. The defaulted variables allow a single gene_block to be chosen individually. The program
# will also store the results of this function in a folder titled blast_query_files, in the recovery folder.
def make_gene_block_fasta2(gene_list, genbank_list, num_processors, outfile, ref_list):
refrence_file_path_list = [i for i in genbank_list if os.path.basename(i).split('.')[0] in ref_list]
#print "refrence_file_path_list", refrence_file_path_list
pool = Pool(processes = num_processors)
organism_dict_for_recovery = dict(pool.map(parallel_gene_block_fasta, refrence_file_path_list))
protein_match = []
rna_match = []
pseudogene_match = []
missing_list = []
refrence_prot = []
refrence_rna = []
# This list should be updated should other types of RNAs be annotated later as parts of (siRNA comes to mind).
RNA_codes = ['rRNA', 'tRNA', 'ncRNA']
for org in list(organism_dict_for_recovery.keys()):
for gene in gene_list:
if gene in list(organism_dict_for_recovery[org].keys()):
if organism_dict_for_recovery[org][gene][0].split('|')[7] == 'Protein':
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
protein_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_prot.append(outseq)
elif organism_dict_for_recovery[org][gene][0].split('|')[7] in RNA_codes:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
rna_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_rna.append(outseq)
elif organism_dict_for_recovery[org][gene][0].split('|')[7] == 'Pseudo_Gene':
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
pseudogene_match.append(outseq)
if org in ref_list:
outseq = SeqRecord(Seq(organism_dict_for_recovery[org][gene][1]),
id=organism_dict_for_recovery[org][gene][0], description = '')
refrence_prot.append(outseq)
else:
print(("There was an error in function make_gene_block_fasta2 from script make_operon_query.py in", organism_dict_for_recovery[org][gene][0]))
else: # The gene is missing, we will look for it at a later stage in the program
item = '\t'.join([org, gene])
missing_list.append(item)
handle = open(outfile, 'w')
SeqIO.write(protein_match, handle,"fasta")
handle.close()
'''
handle = open(folder + 'rna_matches.fa', 'w')
SeqIO.write(rna_match, handle,"fasta")
handle.close()
handle = open(folder + 'pseudogene_matches.fa', 'w')
SeqIO.write(pseudogene_match, handle,"fasta")
handle.close()
handle = open(folder + 'missing_gene_block_genes.txt', 'w')
handle.write('\n'.join(missing_list))
handle.close()
# The goal here is to return the refrence fasta files, so that we can do self homolog dict, which is needed to detect
# fusion proteins later.
#ref_prot_outfile = folder + 'refrence_prot.fa'
#ref_rna_outfile = folder + 'refrence_rna.fa'
#print "len(gene_list)", len(gene_list)
handle = open(ref_prot_outfile, 'w')
SeqIO.write(refrence_prot, handle, "fasta")
#print "len(refrence_prot)", len(refrence_prot)
handle.close()
handle = open(folder + 'refrence_rna.fa', 'w')
SeqIO.write(refrence_rna, ref_rna_outfile, "fasta")
#print "len(refrence_rna)", len(refrence_rna)
handle.close()
return ref_prot_outfile, ref_rna_outfile, organism_dict_for_recovery
'''
# This function validates the gene_blocks in the references. any gene_blocks which cannot be located in their entirety are omitted
# it returns two dicts. one has just the gene_blocks and a list of their genes, the other has gene_blocks, list of genes with the
# type of product produced
def categorize_gene_blocks(ref_org, genbank_list, gene_block_dict):
ref_path = [i for i in genbank_list if i.split('/')[-1].split('.')[0] in ref_org]
print(("ref_org",ref_org))
print(("ref_path",ref_path))
# dict of the gene_blocks that are validated (could cind all the constituent genes in the reference)
result = {}
# dict of the gene_blocks that are validated (could cind all the constituent genes in the reference) and the type of product
result_categorized = {}
#print "gene_block_dict", len(gene_block_dict), gene_block_dict
prot_gene_blocks = []
mixed_gene_blocks = []
#ref_dict = return_genbank_dict(ref_path)
# basically need to update this for a list of refrence organisms... this is not done just yet.
ref_dict = return_genbank_dict(ref_path[0])
for gene_block in sorted(gene_block_dict.keys()):
gene_block_error = False
type_list = []
for gene in gene_block_dict[gene_block]:
try:
gene_type = ref_dict[gene][0].split('|')[7]
# if protein
if gene_type == 'Protein' or gene_type == 'Pseudogene':
type_list.append("%s:p" % gene)
# if RNA
else:
type_list.append("%s:r" % gene)
#print "RNA", gene, gene_type
except:
print(("gene_block", gene_block, "is not usable as an gene_block, I should remove it. The gene in error is in", gene))
gene_block_error = True
# no errors
if not gene_block_error:
result.update({gene_block: gene_block_dict[gene_block]})
result_categorized.update({gene_block:type_list})
#print "result", result
#print "result_categorized", result_categorized
return result, result_categorized
# There are a lot of things that will have to be done here:
# 1) include RNA coding genes
# 2) something else i can't think of right now
def return_self_homolog_dict(filtered_gene_block_dict, reference_fasta_prot, outfile, outfolder):
gene_to_gene_block_dict = {}
#print "filtered_gene_block_dict", filtered_gene_block_dict
for gene_block in list(filtered_gene_block_dict.keys()):
for gene in filtered_gene_block_dict[gene_block]:
gene_to_gene_block_dict.update({gene:gene_block})
#print "booyah", len(filtered_gene_block_dict), len(gene_to_gene_block_dict)
# set up databases for the different types of genes
# for proteins -p must be set to true
cmd = "formatdb -i %s -p T -o F" % (reference_fasta_prot)
os.system(cmd)
# for RNA genes -p must be set to false
#cmd = "formatdb -i %s -p F -o F" % (rna_file)
#os.system(cmd)
# blast each set of genes against itself
#cmd = "blastall -p blastp -a %i -i %s -d %s -e %s -o %s -m 9" % (os.sysconf("SC_NPROCESSORS_ONLN"), reference_fasta_prot, reference_fasta_prot, '1e-10', outfolder + 'self_prot.txt')
#print "outfolder + 'self_prot.txt'", outfolder + 'self_prot.txt'
cmd = "blastall -p blastp -a %i -i %s -d %s -e %s -o %s -m 9" % (1, reference_fasta_prot, reference_fasta_prot, '1e-10', outfolder + 'self_prot.txt')
os.system( cmd )
#cmd = "blastall -p blastn -a %i -i %s -d %s -e %s -o %s -m 9" % (os.sysconf("SC_NPROCESSORS_ONLN"), rna_file, rna_file, '1e-10', './recover/self_rna.txt')
#os.system( cmd )
# in this next section i will read in the resulting blast results, and construct a dictionary which will be keyed off gene name and provide a list
# of homologs from the gene_block set. This list will allow the program to filter out spurious results. We will miss fusions of homologous genes, but
# hopefully this will be a rare event in our dataset, untill this can be revised
lst = [i.strip() for i in open(outfolder + 'self_prot.txt').readlines() if i[0] != '#']
result = {}
for line in lst:
source, hit = line.split('\t')[0:2]
#source_annotation = source.split('|')[2]
source_annotation = source.split('|')[3]
#hit_annotation = hit.split('|')[2]
hit_annotation = hit.split('|')[3]
# we have two genes in the test set that are homologous
if source_annotation != hit_annotation:
if source_annotation not in list(result.keys()):
result.update({source_annotation: [hit_annotation]})
else:
result[source_annotation].append(hit_annotation)
if hit_annotation not in list(result.keys()):
result.update({hit_annotation: [source_annotation]})
else:
result[hit_annotation].append(source_annotation)
#print "result1", result
for key in list(result.keys()):
result.update({key: list(set(result[key]))})
#print "result2", result
json_handle = open(outfile, 'w')
json.dump(result, json_handle)
json_handle.close()
return result
def make_gene_block_individual_gene_fasta(validated_operon_dict, org_annotation_dict, refrence, outfolder):
# I use this in a later stage, but declare here.
intermediate_operon_fasta_folder = outfolder + 'cleaned_gene_fasta/'
if not os.path.isdir(intermediate_operon_fasta_folder):
os.makedirs(intermediate_operon_fasta_folder)
for operon in sorted(validated_operon_dict.keys()):
operon_folder = outfolder + operon + '/'
if not os.path.isdir(operon_folder):
os.makedirs(operon_folder)
for gene in validated_operon_dict[operon]:
gene_fasta_outfile = operon_folder + gene + '.fa'
result = []
for org in list(org_annotation_dict.keys()):
if gene in list(org_annotation_dict[org].keys()):
#print "org_annotation_dict[org][gene]", org_annotation_dict[org][gene]
outseq = SeqRecord(Seq(org_annotation_dict[org][gene][1]),
id=org_annotation_dict[org][gene][0], description = '')
result.append(outseq)
handle = open(gene_fasta_outfile, 'w')
SeqIO.write(result, handle, "fasta")
#print "len(refrence_prot)", len(refrence_prot)
handle.close()
# Step1: Cluster the annotated gene block genes using CD-HIT, 60% ident seems to work out just fine in my testing so far.
cd_hit_clustered_file = gene_fasta_outfile.replace('.', '_cd_hit.')
cmd = "cdhit -i %s -o %s -c %s -n %s >> cdhit_output_dump.txt" % (gene_fasta_outfile, cd_hit_clustered_file, '.6', '4')
os.system(cmd)
# Step2: From the represenative sequences in step 1, make a BLAST database so an all vs all BLAST search can be performed.
cmd = "formatdb -i %s -p T -o F" % (cd_hit_clustered_file)
os.system(cmd)
# Step3: Run all vs all BLAST to see if there are represenative genes that do not have homologs. These genes will be considered misannoated.
balst_reslt_file = operon_folder + gene + '_BLAST.txt'
#print "operon_folder + gene + _BLAST.txt", operon_folder + gene + '_BLAST.txt'
cmd = "blastall -p blastp -a %i -i %s -d %s -e %s -o %s -m 9" % (1, cd_hit_clustered_file, cd_hit_clustered_file, '1e-03', balst_reslt_file)
os.system( cmd )
# Step4: Read the tabular BLAST file resulting from step 3. Make a count dict keyed on the query (first field) and count the number of hits.
# Any query containing less than 2 hits is removed, since that gene is likely misannotated in the organism's genbank file. (genes will self hit)
hit_cnt_dict = {}
for hit in [i.split('\t')[0] for i in open(balst_reslt_file).readlines() if i[0] != '#']:
if hit in list(hit_cnt_dict.keys()):
hit_cnt_dict[hit] += 1
else:
hit_cnt_dict.update({hit:1})
# The situation is this: if you have the same number of hits as you do represenatives, then you do not want to remove singletons.
# This block of code allows this to be considered.
sum_dict_vals = 0
for i in list(hit_cnt_dict.keys()):
sum_dict_vals += hit_cnt_dict[i]
# Data from the cluster represenatives is not factored in at this point. I suspect that I should include this information.
# This way I will not remove clusters that have many constituent genes but are not homologous with others in the group.
sequences_to_remove = []
for i in list(hit_cnt_dict.keys()):
#print "i.split('|')[0]", i.split('|')[0], "refrence", refrence
if hit_cnt_dict[i] == 1 and len(hit_cnt_dict) != sum_dict_vals and i.split('|')[0] not in refrence:
#print "misannotation", i
sequences_to_remove.append(i)
# Step5: Filter the results, since at this point we have a list of genes that may be misannotated.
fasta_genes_to_keep = []
handle = open(cd_hit_clustered_file, "rU")
for rec in SeqIO.parse(handle, "fasta") :
if rec.id not in sequences_to_remove:
fasta_genes_to_keep.append(rec)
# Step6: Record the results of this pipeline for the individual gene.
final_outfile = operon_folder + gene + '_misannotations_removed.fa'
handle = open(final_outfile, 'w')
SeqIO.write(fasta_genes_to_keep, handle,"fasta")
handle.close()
# I am going to make a new folder to aggregate all of the final results, then cat them into a single file saved in the main folder
final_outfile = intermediate_operon_fasta_folder + gene + '.fa'
handle = open(final_outfile, 'w')
SeqIO.write(fasta_genes_to_keep, handle,"fasta")
handle.close()
# Step7: Aggregate the results of the previous steps into a single operon query file. This file has represenative gene block genes
# that are filtered for obvious possible misannotations. The result is likely throwing out more genes than necessary, but I prefer caution here.
final_result_file = outfolder + "operon_blast_query.fa"
cat_list = returnRecursiveDirFiles(intermediate_operon_fasta_folder)
with open(final_result_file, 'w') as handle:
for fname in cat_list:
with open(fname) as infile:
handle.write(infile.read())
def main():
start = time.time()
parsed_args = parser_code()
infolder, outfile, gene_block_file, num_proc, refrence_list, quiet = check_options(parsed_args)
if not quiet:
print((infolder, outfile, gene_block_file, num_proc, refrence_list, quiet))
# This section of code will return a parsed gene block file as a dictionary keyed by gene block name
#parsed_gene_block_file = './regulonDB/gene_block_names_and_genes_unfiltered.txt'
# later, this should be done through the cmd line interface... just running out of time here
gene_block_dict = parse_gene_block_file(gene_block_file)
# This section of code will return the full pathways to the genbank files of interest
genbank_list = returnRecursiveDirFiles(infolder)
# check to make sure that all gene blocks that we are looking at can be found in their entirety before we use them as
# part of the data set. The validated gene block dict contains information about the type (protein/RNA) of gene product.
validated_gene_block_dict, validated_gene_block_dict_more_info = categorize_gene_blocks(refrence_list, genbank_list, gene_block_dict)
# this is a list of all genes that we have in the dataset, reguardless of wether we can find them in the reference or not.
unvalidated_gene_list = []
# this is a list of all the genes that are in the validated dataset. we can find all the genes in the reference organism for the entire operon.
validated_gene_list = []
for gene_block in list(gene_block_dict.keys()):
unvalidated_gene_list = unvalidated_gene_list + gene_block_dict[gene_block]
for gene_block in list(validated_gene_block_dict.keys()):
validated_gene_list = validated_gene_list + validated_gene_block_dict[gene_block]
#ref_prot_outfile, ref_rna_outfile, org_annotation_dict = make_gene_block_fasta2(validated_gene_list, genbank_list, num_proc, outfolder, refrence)
make_gene_block_fasta2(validated_gene_list, genbank_list, num_proc, outfile, refrence_list)
# currently, we are only looking at protein sequences. The following code reflects this.
'''
self_homolog_dict = return_self_homolog_dict(validated_operon_dict, ref_prot_outfile, outfolder + 'operon_homolog_dict.json', outfolder)
# at this point i have the self homolog dict, i have a file that has all the e.coli operons validated (for one version of K-12)
# and i have the operons validated per operon, at least in terms of prot or RNA. now what i am going to do it to take the operons
# individually, make a folder for them in the outfolder, named for the operon, and create a fasta file of all the annotated examples.
# once this is done, then we will run CD hit on the individual genes to get a set of represenatives per gene. the clustering will be
# done on the a.a. seq at 70%. Then i would like to screne the clusters to try to remove possible mis-annotations.
make_gene_block_individual_gene_fasta(validated_operon_dict, org_annotation_dict, refrence, outfolder)
'''
# easy way to run this, using all the defaults that make sense
# ./make_operon_query.py -i /home/dave/Desktop/all_genbank -o ./operon_query.fa -p ./regulonDB/operon_names_and_genes.txt
# ./make_operon_query.py -i ./genomes/
if not quiet:
print((time.time() - start))
if __name__ == '__main__':
main()