-
Notifications
You must be signed in to change notification settings - Fork 0
/
narrow_to_complete_genomes.py
131 lines (95 loc) · 4.88 KB
/
narrow_to_complete_genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import Entrez as ncbi
import xml.etree.ElementTree as ET
import pandas as pd
import itertools
import numpy as np
import copy
# path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER/ftp.ncbi.nih.gov/refseq/release/bacteria/genbank')
path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER_RELEASE69/genbank')
env_fname = "env_catalog_bacter.dat"
descriptions_fname = "genbank.inventory.description"
env_df = pd.read_csv(os.path.join(path,env_fname))
descriptions = pd.read_csv(os.path.join(path,descriptions_fname))
# Expect some silly warnings, like this:
# Columns (11) have mixed types
bio_match = re.compile("BioProject:(.+)")
taxon_match = re.compile("taxon:(.+)")
def extract_bio_uid(dbxref):
uids = str(dbxref).strip().replace(';',' ').split(' ')
matched_uids = [bio_match.match(uid) for uid in uids if bio_match.match(uid)]
# bio_match.match(uid) is executed twice here, but who cares ...
if matched_uids:
accession = matched_uids[0].groups()[0]
uid = re.match("[A-Z]+(\d+)",accession).groups()[0]
return int(uid)
# return just a single BioProject uid, I believe that would be correct 100% of times
else:
return np.nan
def extract_taxon(dbxref):
uids = str(dbxref).strip().replace(';',' ').split(' ')
matched_txs = [taxon_match.match(uid) for uid in uids if taxon_match.match(uid)]
# taxon_match.match(uid) is executed twice here, but who cares ...
if matched_txs:
uid = matched_txs[0].groups()[0]
return int(uid)
# return just a single taxon uid, I believe that would be correct 100% of times
else:
return np.nan
# extract solely BioProject uid first ...
descriptions['BioProject'] = descriptions['DbxRefs'].apply(extract_bio_uid)
# extract solely Taxonomy uid first ...
descriptions['TaxonID'] = descriptions['SourceDbxRefs'].apply(extract_taxon)
# make BioProject UID an index
env_df = env_df.set_index('BioProject', drop=True, append=False)
# DB-style outter join of the tables
des = descriptions.join(env_df, on='BioProject', lsuffix='_des', rsuffix='_env')
# Entries with known temperature ...
des_temp = des[(des.OptimumTemperature.notnull())]
# get those that are not plasmids
noplasm_idx = des_temp.SourcePlasmid.isnull().nonzero()[0]
des_noplasmid = copy.deepcopy(des_temp.iloc[noplasm_idx])
# criteria for the Complete Genome ...
crit_NC = lambda name : True if (name[:3]=='NC_') else False
crit_comp_genome = lambda descr: True if re.search('complete genome',descr) else False
crit_features = lambda nf: True if (nf>=900) else False
crit_genlen = lambda genlen: True if (genlen>=300000) else False
crit_WGS = lambda keywords: False if re.search('WGS',keywords) else True
crit_plasmid = lambda descr: False if re.search('plasmid',descr) else True
# evaluate those criteria ...
des_noplasmid['crit_NC'] = des_noplasmid.GenomicID.apply(crit_NC)
des_noplasmid['crit_WGS'] = des_noplasmid.Keywords.apply(crit_WGS)
des_noplasmid['crit_genlen'] = des_noplasmid.GenomicLen.apply(crit_genlen)
des_noplasmid['crit_features'] = des_noplasmid.FeaturesNum.apply(crit_features)
des_noplasmid['crit_comp_genome'] = des_noplasmid.Description.apply(crit_comp_genome)
des_noplasmid['crit_plasmid'] = des_noplasmid.Description.apply(crit_plasmid)
# get those items that satisfy these criteria ...
criteria_to_satisfy = ['crit_NC', 'crit_WGS', 'crit_genlen', 'crit_features', 'crit_comp_genome', 'crit_plasmid']
indexes_satisfy = (des_noplasmid[criteria_to_satisfy].sum(axis=1)==len(criteria_to_satisfy)).nonzero()[0]
# des_valid = copy.deepcopy(des_noplasmid.iloc[indexes_satisfy].OptimumTemperature.notnull())
des_valid = copy.deepcopy(des_noplasmid.iloc[indexes_satisfy])
# now collapse non-unique things,
# there shouldn't be many of them, at this point
# [CHECK] - just 3 of them found for the RELEASE69 ...
cleaned_data = des_valid.drop_duplicates(subset='TaxonID')
check_bp = cleaned_data.duplicated(subset='BioProject').nonzero()[0].size
check_org1 = cleaned_data.duplicated(subset='Organism_des').nonzero()[0].size
check_org2 = cleaned_data.duplicated(subset='Organism_env').nonzero()[0].size
if (check_bp+check_org1+check_org2)==0:
print "Data looks extremely clean: no duplicates, complete genomes only, opt temps available ..."
elif check_bp:
print "WARNING: There are duplicated BioProject UIDs in the final data"
elif check_org1:
print "WARNING: There are duplicated Organisms in the final data"
elif check_org2:
print "WARNING: There are duplicated Organisms in the final data"
# some information summary ...
thermophiles_counts = (cleaned_data.OptimumTemperature>=50).nonzero()[0].size
total_counts = cleaned_data.shape[0]
print "Cleaned data summary: %d thermophiles (OGT>=50) out of %d species in total."
# GenomicID is the best candidate to be an index ...
cleaned_data.to_csv("env_catalog_compgenome.dat",index=False)