-
Notifications
You must be signed in to change notification settings - Fork 0
/
proteome_all_analysis.py
80 lines (65 loc) · 2.37 KB
/
proteome_all_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import SeqUtils
import pandas as pd
from functools import partial
import time
from multiprocessing import Pool
aacids = list('CMFILVWYAGTSNQDEHRKP')
def get_aausage_proteome(seqrec):
# seqrec = db[seqrec_id]
features = seqrec.features
proteome = []
for feature in features:
qualifiers = feature.qualifiers
if (feature.type == 'CDS')and('translation' in qualifiers):
proteome.append(qualifiers['translation'][0])
#return the results ...
proteome = ''.join(proteome)
prot_len = float(len(proteome))
aa_freq = tuple(proteome.count(aa)/prot_len for aa in aacids)
#
return (int(prot_len),) + aa_freq
def analyse_genome(db,seqrec_id):
seqrec = db[seqrec_id]
pl_aa_freq = get_aausage_proteome(seqrec)
gc = SeqUtils.GC(seqrec.seq)
id = seqrec.id
return (id,gc) + pl_aa_freq
if __name__ == "__main__":
path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER_RELEASE69/genbank')
# dbx = SeqIO.index_db(os.path.join(path,'genbank.idx'))
dbx = SeqIO.index_db(os.path.join(path,"subset.idx"))
# subset.idx is a previously indexed db for the single genbank file with complete genomes only (the one ~5GB).
dat = pd.read_csv(os.path.join(path,"env_catalog_compgenome.dat"))
#
#
def do_work(seqrecid):
return analyse_genome(dbx,seqrecid)
#
work = list(dat.GenomicID)
#
print "launching processes, to do %d pieces of work ..."%len(work)
pool = Pool(processes=16)
results = pool.map(do_work, work)
#
print "file outputting ..."
with open("proteome_all.dat","w") as fp:
fp.write("GenomicID,GC,ProtLen,"+",".join(aacids)+"\n")
for result in results:
fp.write(','.join(str(item) for item in result)+'\n')
# DbxRefs,Description,FeaturesNum,GenomicID,
# GenomicLen,GenomicName,Keywords,NucsPresent,
# Organism_des,SourceDbxRefs,SourceOrganism,
# SourcePlasmid,SourceStrain,Taxonomy,BioProject,
# TaxonID,Organism_env,OptimumTemperature,
# TemperatureRange,OxygenReq,Habitat,Salinity,
# crit_NC,crit_WGS,crit_genlen,crit_features,
# crit_comp_genome,crit_plasmid
# takes 4 min 40 seconds to accomplish for 4 processes ...
# takes 1 min 50 seconds to accomplish for 12 processes ...
# takes 48 seconds to accomplish for 32 processes ...
# everything ob bib6