Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix merged nodes #49

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions make_ktaxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
#
#Required Parameters:
# --nodes X...........................nodes.dmp file
# --names X...........................names.dmp file
# --names X...........................names.dmp file
# --merged X..........................merged.dmp file
# --seqid2taxid X.....................seqid2taxid.map file
# -o, --output X......................output file with taxonomy info
#Optional Parameters:
Expand Down Expand Up @@ -67,6 +68,7 @@ def main():
help='nodes.dmp file from taxonomy')
parser.add_argument('--names',dest='names_file', required=True,
help='names.dmp file from taxonomy')
parser.add_argument('--merged', dest='merged_file', required=True, help='merged.dmp file from taxonomy')
parser.add_argument('--seqid2taxid',dest='s2t_file', required=True,
help='seqid2taxid.map file')
parser.add_argument('-o','--output',dest='out_file', required=True,
Expand All @@ -87,11 +89,11 @@ def main():
#STEP 1/5: PARSE NODES.DMP FILE
root_node = -1
taxid2node = {}
p_notsaved = {}
p_notsaved = {}
nodes_f = open(args.nodes_file,'r')
sys.stdout.write(">> STEP 1/5: Reading %s\n" % args.nodes_file)
sys.stdout.write("\t%0 nodes read")
count_nodes = 0
count_nodes = 0
for line in nodes_f:
count_nodes += 1
if (count_nodes % 100) == 0:
Expand All @@ -116,8 +118,26 @@ def main():
else:
#parent not linked
p_notsaved[curr_taxid] = curr_node
curr_node.p_taxid = parent_taxid
nodes_f.close()
curr_node.p_taxid = parent_taxid
nodes_f.close()

# Add each outdated taxid listed in merged.dmp to the tree as a child
# of the updated taxid it was merged into.
# The outdated taxid is given the same rank as its updated taxid.
updated_taxid_lookup = {}
with open(args.merged_file, 'r') as merged_f:
for line in merged_f:
outdated_taxid, updated_taxid, *_ = [taxid.strip() for taxid in line.split("|")]

updated_taxid_lookup[outdated_taxid] = updated_taxid

rank = taxid2node[updated_taxid].level_rank
curr_node = Tree(outdated_taxid, rank, parent=taxid2node[updated_taxid])
taxid2node[outdated_taxid] = curr_node
taxid2node[outdated_taxid].p_taxid = updated_taxid

taxid2node[updated_taxid].add_child(taxid2node[outdated_taxid])

sys.stdout.write("\r\t%i nodes read\n" % count_nodes)
sys.stdout.flush()
#Fix parents
Expand Down Expand Up @@ -193,6 +213,15 @@ def main():
elif "scientific name" in line:
save_taxids[taxid].name = name
names_f.close()

# Give each outdated taxid the name of the updated taxid it was merged into;
# pairs with either taxid missing from save_taxids are skipped.
for outdated_taxid, updated_taxid in updated_taxid_lookup.items():
try:
updated_name = save_taxids[updated_taxid].name
save_taxids[outdated_taxid].name = updated_name
except KeyError:
continue

sys.stdout.write("\r\t%i/%i names found\n" % (count_names, count_final))
sys.stdout.flush()
#STEP 5/5: PRINT NEW TAXONOMY
Expand Down