-
Notifications
You must be signed in to change notification settings - Fork 2
/
ncbi_taxonomy.py
84 lines (73 loc) · 2.57 KB
/
ncbi_taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import urllib2
import Bio.Phylo as bp
from Bio.Phylo import Newick
import os
import tarfile
col_delimiter = '\t|\t'
row_delimiter = '\t|\n'
url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
tree_filename = 'ncbi_taxonomy.newick'
tree_format = 'newick'
data_dir = 'data/'
if not os.path.exists(data_dir): os.mkdir(data_dir)
# download the taxonomy archive
filename = os.path.join(data_dir, url.split('/')[-1])
if os.path.exists(filename):
print 'Using existing copy of %s' % filename
else:
print 'Downloading %s...' % filename
r = urllib2.urlopen(urllib2.Request(url))
assert r.geturl() == url
with open(filename, 'wb') as output_file:
output_file.write(r.read())
r.close()
# extract the text dump
for extract in ('nodes.dmp', 'names.dmp'):
if os.path.exists(os.path.join(data_dir, extract)):
print 'Using existing copy of %s' % extract
else:
print 'Extracting %s from %s...' % (extract, filename)
archive = tarfile.open(name=filename, mode='r:gz')
archive.extract(extract, path=data_dir)
archive.close()
# get names for all tax_ids from names.dmp
print 'Getting names...'
scientific_names = {}
common_names = {}
with open(os.path.join(data_dir, 'names.dmp')) as names_file:
for line in names_file:
line = line.rstrip(row_delimiter)
values = line.split(col_delimiter)
tax_id, name_txt, _, name_type = values[:4]
if name_type == 'scientific name':
scientific_names[tax_id] = name_txt
elif name_type == 'common name':
common_names[tax_id] = name_txt
# read all node info from nodes.dmp
print 'Reading taxonomy...'
nodes = {}
with open(os.path.join(data_dir, 'nodes.dmp')) as nodes_file:
for line in nodes_file:
line = line.rstrip(row_delimiter)
values = line.split(col_delimiter)
tax_id, parent_id = values[:2]
this_node = Newick.Clade(name=scientific_names[tax_id])
if tax_id in common_names:
this_node.comment = common_names[tax_id]
nodes[tax_id] = this_node
this_node.parent = parent_id
# create tree from nodes dictionary
print 'Building tree...'
for node_id, this_node in nodes.iteritems():
if node_id == this_node.parent:
root_node = this_node
print 'Found root.'
else:
parent_node = nodes[this_node.parent]
parent_node.clades.append(this_node)
del this_node.parent
tree = Newick.Tree(root=root_node)
# write tree to file
print 'Writing %s tree to %s...' % (tree_format, tree_filename)
bp.write([tree], tree_filename, tree_format)
print 'Done!'